From 9143fcd7fd38243dd40f927dafaeb75f6ef8ef49 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:47:17 -0800
Subject: Add UDP matchers.

---
 pkg/abi/linux/netfilter.go               |  92 +++++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go | 164 ++++++++++++++++++++++++++-----
 pkg/tcpip/iptables/BUILD                 |   2 +
 pkg/tcpip/iptables/iptables.go           |   3 +
 pkg/tcpip/iptables/tcp_matcher.go        | 122 +++++++++++++++++++++++
 pkg/tcpip/iptables/types.go              |  17 ++++
 pkg/tcpip/iptables/udp_matcher.go        | 127 ++++++++++++++++++++++++
 pkg/tcpip/network/ipv4/ipv4.go           |  10 +-
 8 files changed, 509 insertions(+), 28 deletions(-)
 create mode 100644 pkg/tcpip/iptables/tcp_matcher.go
 create mode 100644 pkg/tcpip/iptables/udp_matcher.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 33fcc6c95..fb4588272 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -340,3 +340,95 @@ func goString(cstring []byte) string {
 	}
 	return string(cstring)
 }
+
+// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
+// in include/uapi/linux/netfilter/xt_tcpudp.h.
+type XTTCP struct {
+	// SourcePortStart specifies the inclusive start of the range of source
+	// ports to which the matcher applies.
+	SourcePortStart uint16
+
+	// SourcePortEnd specifies the inclusive end of the range of source ports
+	// to which the matcher applies.
+	SourcePortEnd uint16
+
+	// DestinationPortStart specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortStart uint16
+
+	// DestinationPortEnd specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortEnd uint16
+
+	// Option specifies that a particular TCP option must be set.
+	Option uint8
+
+	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
+	// fields.
+	FlagMask uint8
+
+	// FlagCompare is binary and-ed with the TCP flag fields.
+	FlagCompare uint8
+
+	// InverseFlags flips the meaning of certain fields. See the
+	// TX_TCP_INV_* flags.
+	InverseFlags uint8
+}
+
+// SizeOfXTTCP is the size of an XTTCP.
+const SizeOfXTTCP = 12
+
+// Flags in XTTCP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_tcpudp.h.
+const (
+	// Invert the meaning of SourcePortStart/End.
+	XT_TCP_INV_SRCPT = 0x01
+	// Invert the meaning of DestinationPortStart/End.
+	XT_TCP_INV_DSTPT = 0x02
+	// Invert the meaning of FlagCompare.
+	XT_TCP_INV_FLAGS = 0x04
+	// Invert the meaning of Option.
+	XT_TCP_INV_OPTION = 0x08
+	// Enable all flags.
+	XT_TCP_INV_MASK = 0x0F
+)
+
+// XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
+// in include/uapi/linux/netfilter/xt_tcpudp.h.
+type XTUDP struct {
+	// SourcePortStart specifies the inclusive start of the range of source
+	// ports to which the matcher applies.
+	SourcePortStart uint16
+
+	// SourcePortEnd specifies the inclusive end of the range of source ports
+	// to which the matcher applies.
+	SourcePortEnd uint16
+
+	// DestinationPortStart specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortStart uint16
+
+	// DestinationPortEnd specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortEnd uint16
+
+	// InverseFlags flips the meaning of certain fields. See the
+	// TX_UDP_INV_* flags.
+	InverseFlags uint8
+
+	_ uint8
+}
+
+// SizeOfXTUDP is the size of an XTUDP.
+const SizeOfXTUDP = 10
+
+// Flags in XTUDP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_tcpudp.h.
+const (
+	// Invert the meaning of SourcePortStart/End.
+	XT_UDP_INV_SRCPT = 0x01
+	// Invert the meaning of DestinationPortStart/End.
+	XT_UDP_INV_DSTPT = 0x02
+	// Enable all flags.
+	XT_UDP_INV_MASK = 0x03
+)
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index e1f2bacce..45296b339 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -131,6 +131,7 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
+// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
@@ -318,10 +319,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 		var entry linux.IPTEntry
 		buf := optVal[:linux.SizeOfIPTEntry]
-		optVal = optVal[linux.SizeOfIPTEntry:]
 		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
-		if entry.TargetOffset != linux.SizeOfIPTEntry {
-			// TODO(gvisor.dev/issue/170): Support matchers.
+		initialOptValLen := len(optVal)
+		optVal = optVal[linux.SizeOfIPTEntry:]
+
+		if entry.TargetOffset < linux.SizeOfIPTEntry {
+			log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset)
 			return syserr.ErrInvalidArgument
 		}
 
@@ -332,19 +335,41 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			return err
 		}
 
+		// TODO: Matchers (and maybe targets) can specify that they only work for certiain protocols, hooks, tables.
+		// Get matchers.
+		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
+		if len(optVal) < int(matchersSize) {
+			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+		}
+		matchers, err := parseMatchers(filter, optVal[:matchersSize])
+		if err != nil {
+			log.Warningf("netfilter: failed to parse matchers: %v", err)
+			return err
+		}
+		optVal = optVal[matchersSize:]
+
 		// Get the target of the rule.
-		target, consumed, err := parseTarget(optVal)
+		targetSize := entry.NextOffset - entry.TargetOffset
+		if len(optVal) < int(targetSize) {
+			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+		}
+		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
 			return err
 		}
-		optVal = optVal[consumed:]
+		optVal = optVal[targetSize:]
 
 		table.Rules = append(table.Rules, iptables.Rule{
-			Filter: filter,
-			Target: target,
+			Filter:   filter,
+			Target:   target,
+			Matchers: matchers,
 		})
 		offsets = append(offsets, offset)
-		offset += linux.SizeOfIPTEntry + consumed
+		offset += uint32(entry.NextOffset)
+
+		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+			log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+		}
 	}
 
 	// Go through the list of supported hooks for this table and, for each
@@ -401,12 +426,105 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	return nil
 }
 
-// parseTarget parses a target from the start of optVal and returns the target
-// along with the number of bytes it occupies in optVal.
-func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
+// parseMatchers parses 0 or more matchers from optVal. optVal should contain
+// only the matchers.
+func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+	var matchers []iptables.Matcher
+	for len(optVal) > 0 {
+		log.Infof("parseMatchers: optVal has len %d", len(optVal))
+		// Get the XTEntryMatch.
+		if len(optVal) < linux.SizeOfXTEntryMatch {
+			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		var match linux.XTEntryMatch
+		buf := optVal[:linux.SizeOfXTEntryMatch]
+		binary.Unmarshal(buf, usermem.ByteOrder, &match)
+		log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+
+		// Check some invariants.
+		if match.MatchSize < linux.SizeOfXTEntryMatch {
+			log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
+			return nil, syserr.ErrInvalidArgument
+		}
+		if len(optVal) < int(match.MatchSize) {
+			log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize]
+		var matcher iptables.Matcher
+		var err error
+		switch match.Name.String() {
+		case "tcp":
+			if len(buf) < linux.SizeOfXTTCP {
+				log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal))
+				return nil, syserr.ErrInvalidArgument
+			}
+			var matchData linux.XTTCP
+			// For alignment reasons, the match's total size may exceed what's
+			// strictly necessary to hold matchData.
+			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+			log.Infof("parseMatchers: parsed XTTCP: %+v", matchData)
+			matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{
+				SourcePortStart:      matchData.SourcePortStart,
+				SourcePortEnd:        matchData.SourcePortEnd,
+				DestinationPortStart: matchData.DestinationPortStart,
+				DestinationPortEnd:   matchData.DestinationPortEnd,
+				Option:               matchData.Option,
+				FlagMask:             matchData.FlagMask,
+				FlagCompare:          matchData.FlagCompare,
+				InverseFlags:         matchData.InverseFlags,
+			})
+			if err != nil {
+				log.Warningf("netfilter: failed to create TCP matcher: %v", err)
+				return nil, syserr.ErrInvalidArgument
+			}
+
+		case "udp":
+			if len(buf) < linux.SizeOfXTUDP {
+				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
+				return nil, syserr.ErrInvalidArgument
+			}
+			var matchData linux.XTUDP
+			// For alignment reasons, the match's total size may exceed what's
+			// strictly necessary to hold matchData.
+			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
+			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{
+				SourcePortStart:      matchData.SourcePortStart,
+				SourcePortEnd:        matchData.SourcePortEnd,
+				DestinationPortStart: matchData.DestinationPortStart,
+				DestinationPortEnd:   matchData.DestinationPortEnd,
+				InverseFlags:         matchData.InverseFlags,
+			})
+			if err != nil {
+				log.Warningf("netfilter: failed to create UDP matcher: %v", err)
+				return nil, syserr.ErrInvalidArgument
+			}
+
+		default:
+			log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String())
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		matchers = append(matchers, matcher)
+
+		// TODO: Support revision.
+		// TODO: Support proto -- matchers usually specify which proto(s) they work with.
+		optVal = optVal[match.MatchSize:]
+	}
+
+	// TODO: Check that optVal is exhausted.
+	return matchers, nil
+}
+
+// parseTarget parses a target from optVal. optVal should contain only the
+// target.
+func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
-		return nil, 0, syserr.ErrInvalidArgument
+		return nil, syserr.ErrInvalidArgument
 	}
 	var target linux.XTEntryTarget
 	buf := optVal[:linux.SizeOfXTEntryTarget]
@@ -414,9 +532,9 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	switch target.Name.String() {
 	case "":
 		// Standard target.
-		if len(optVal) < linux.SizeOfXTStandardTarget {
-			log.Warningf("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal))
-			return nil, 0, syserr.ErrInvalidArgument
+		if len(optVal) != linux.SizeOfXTStandardTarget {
+			log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
 		}
 		var standardTarget linux.XTStandardTarget
 		buf = optVal[:linux.SizeOfXTStandardTarget]
@@ -424,22 +542,22 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 
 		verdict, err := translateToStandardVerdict(standardTarget.Verdict)
 		if err != nil {
-			return nil, 0, err
+			return nil, err
 		}
 		switch verdict {
 		case iptables.Accept:
-			return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil
+			return iptables.UnconditionalAcceptTarget{}, nil
 		case iptables.Drop:
-			return iptables.UnconditionalDropTarget{}, linux.SizeOfXTStandardTarget, nil
+			return iptables.UnconditionalDropTarget{}, nil
 		default:
 			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
 		}
 
 	case errorTargetName:
 		// Error target.
-		if len(optVal) < linux.SizeOfXTErrorTarget {
+		if len(optVal) != linux.SizeOfXTErrorTarget {
 			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
-			return nil, 0, syserr.ErrInvalidArgument
+			return nil, syserr.ErrInvalidArgument
 		}
 		var errorTarget linux.XTErrorTarget
 		buf = optVal[:linux.SizeOfXTErrorTarget]
@@ -454,16 +572,16 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 		//   rules have an error with the name of the chain.
 		switch errorTarget.Name.String() {
 		case errorTargetName:
-			return iptables.ErrorTarget{}, linux.SizeOfXTErrorTarget, nil
+			return iptables.ErrorTarget{}, nil
 		default:
 			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
-			return nil, 0, syserr.ErrInvalidArgument
+			return nil, syserr.ErrInvalidArgument
 		}
 	}
 
 	// Unknown target.
 	log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-	return nil, 0, syserr.ErrInvalidArgument
+	return nil, syserr.ErrInvalidArgument
 }
 
 func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) {
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index 297eaccaf..ff4e3c932 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -7,7 +7,9 @@ go_library(
     srcs = [
         "iptables.go",
         "targets.go",
+        "tcp_matcher.go",
         "types.go",
+        "udp_matcher.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
     visibility = ["//visibility:public"],
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index fc06b5b87..accedba1e 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -138,6 +138,8 @@ func EmptyFilterTable() Table {
 // Check runs pkt through the rules for hook. It returns true when the packet
 // should continue traversing the network stack and false when it should be
 // dropped.
+//
+// Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
 	// we're missing features. Jumps, the call stack, etc. aren't checked
@@ -163,6 +165,7 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	return true
 }
 
+// Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
diff --git a/pkg/tcpip/iptables/tcp_matcher.go b/pkg/tcpip/iptables/tcp_matcher.go
new file mode 100644
index 000000000..6acbd6eb9
--- /dev/null
+++ b/pkg/tcpip/iptables/tcp_matcher.go
@@ -0,0 +1,122 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+type TCPMatcher struct {
+	data TCPMatcherData
+
+	// tablename string
+	// unsigned int matchsize;
+	// unsigned int usersize;
+	// #ifdef CONFIG_COMPAT
+	// unsigned int compatsize;
+	// #endif
+	// unsigned int hooks;
+	// unsigned short proto;
+	// unsigned short family;
+}
+
+// TODO: Delete?
+// MatchCheckEntryParams
+
+type TCPMatcherData struct {
+	// Filter IPHeaderFilter
+
+	SourcePortStart      uint16
+	SourcePortEnd        uint16
+	DestinationPortStart uint16
+	DestinationPortEnd   uint16
+	Option               uint8
+	FlagMask             uint8
+	FlagCompare          uint8
+	InverseFlags         uint8
+}
+
+func NewTCPMatcher(filter IPHeaderFilter, data TCPMatcherData) (Matcher, error) {
+	// TODO: We currently only support source port and destination port.
+	log.Infof("Adding rule with TCPMatcherData: %+v", data)
+
+	if data.Option != 0 ||
+		data.FlagMask != 0 ||
+		data.FlagCompare != 0 ||
+		data.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported TCP matcher flags set")
+	}
+
+	if filter.Protocol != header.TCPProtocolNumber {
+		log.Warningf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
+	}
+
+	return &TCPMatcher{data: data}, nil
+}
+
+// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
+func (tm *TCPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
+	// codesearch).
+	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			log.Warningf("Dropping TCP packet: malicious packet with fragment with fragment offest of 1.")
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO
+	var tcpHeader header.TCP
+	if pkt.TransportHeader != nil {
+		tcpHeader = header.TCP(pkt.TransportHeader)
+	} else {
+		// The TCP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.TCPMinimumSize {
+			// There's no valid TCP header here, so we hotdrop the
+			// packet.
+			// TODO: Stats.
+			log.Warningf("Dropping TCP packet: size to small.")
+			return false, true
+		}
+		tcpHeader = header.TCP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	sourcePort := tcpHeader.SourcePort()
+	destinationPort := tcpHeader.DestinationPort()
+	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index a0bfc8b41..54e66f09a 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -169,12 +169,29 @@ type IPHeaderFilter struct {
 	Protocol tcpip.TransportProtocolNumber
 }
 
+// TODO: Should these be able to marshal/unmarshal themselves?
+// TODO: Something has to map the name to the matcher.
 // A Matcher is the interface for matching packets.
 type Matcher interface {
 	// Match returns whether the packet matches and whether the packet
 	// should be "hotdropped", i.e. dropped immediately. This is usually
 	// used for suspicious packets.
+	//
+	// Precondition: packet.NetworkHeader is set.
 	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
+
+	// TODO: Make this typesafe by having each Matcher have their own, typed CheckEntry?
+	// CheckEntry(params MatchCheckEntryParams) bool
+}
+
+// TODO: Unused?
+type MatchCheckEntryParams struct {
+	Table  string // TODO: Tables should be an enum...
+	Filter IPHeaderFilter
+	Info   interface{} // TODO: Type unsafe.
+	// HookMask       uint8
+	// Family         uint8
+	// NFTCompat      bool
 }
 
 // A Target is the interface for taking an action for a packet.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
new file mode 100644
index 000000000..ce4368a3d
--- /dev/null
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -0,0 +1,127 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"runtime/debug"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+type UDPMatcher struct {
+	data UDPMatcherData
+
+	// tablename string
+	// unsigned int matchsize;
+	// unsigned int usersize;
+	// #ifdef CONFIG_COMPAT
+	// unsigned int compatsize;
+	// #endif
+	// unsigned int hooks;
+	// unsigned short proto;
+	// unsigned short family;
+}
+
+// TODO: Delete?
+// MatchCheckEntryParams
+
+type UDPMatcherData struct {
+	// Filter IPHeaderFilter
+
+	SourcePortStart      uint16
+	SourcePortEnd        uint16
+	DestinationPortStart uint16
+	DestinationPortEnd   uint16
+	InverseFlags         uint8
+}
+
+func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) {
+	// TODO: We currently only support source port and destination port.
+	log.Infof("Adding rule with UDPMatcherData: %+v", data)
+
+	if data.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported UDP matcher flags set")
+	}
+
+	if filter.Protocol != header.UDPProtocolNumber {
+		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+	}
+
+	return &UDPMatcher{data: data}, nil
+}
+
+// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
+func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
+	// codesearch).
+	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+		log.Infof("UDPMatcher: wrong protocol number")
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		log.Infof("UDPMatcher: it's a fragment")
+		if frag == 1 {
+			return false, true
+		}
+		log.Warningf("Dropping UDP packet: malicious fragmented packet.")
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO
+	var udpHeader header.UDP
+	if pkt.TransportHeader != nil {
+		log.Infof("UDPMatcher: transport header is not nil")
+		udpHeader = header.UDP(pkt.TransportHeader)
+	} else {
+		log.Infof("UDPMatcher: transport header is nil")
+		log.Infof("UDPMatcher: is network header nil: %t", pkt.NetworkHeader == nil)
+		// The UDP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.UDPMinimumSize {
+			// There's no valid UDP header here, so we hotdrop the
+			// packet.
+			// TODO: Stats.
+			log.Warningf("Dropping UDP packet: size to small.")
+			return false, true
+		}
+		udpHeader = header.UDP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	sourcePort := udpHeader.SourcePort()
+	destinationPort := udpHeader.DestinationPort()
+	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
+		udpHeader.SourcePort(), udpHeader.DestinationPort(),
+		tm.data.SourcePortStart, tm.data.SourcePortEnd,
+		tm.data.DestinationPortStart, tm.data.DestinationPortEnd)
+	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 85512f9b2..6597e6781 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -353,6 +353,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	}
 	pkt.NetworkHeader = headerView[:h.HeaderLength()]
 
+	hlen := int(h.HeaderLength())
+	tlen := int(h.TotalLength())
+	pkt.Data.TrimFront(hlen)
+	pkt.Data.CapLength(tlen - hlen)
+
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
 	ipt := e.stack.IPTables()
@@ -361,11 +366,6 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 		return
 	}
 
-	hlen := int(h.HeaderLength())
-	tlen := int(h.TotalLength())
-	pkt.Data.TrimFront(hlen)
-	pkt.Data.CapLength(tlen - hlen)
-
 	more := (h.Flags() & header.IPv4FlagMoreFragments) != 0
 	if more || h.FragmentOffset() != 0 {
 		if pkt.Data.Size() == 0 {
-- 
cgit v1.2.3


From 2661101ad470548cb15dce0afc694296668d780a Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:51:28 -0800
Subject: Removed TCP work (saved in ipt-tcp-match).

---
 pkg/abi/linux/netfilter.go               |  52 -------------
 pkg/sentry/socket/netfilter/netfilter.go |  26 -------
 pkg/tcpip/iptables/tcp_matcher.go        | 122 -------------------------------
 3 files changed, 200 deletions(-)
 delete mode 100644 pkg/tcpip/iptables/tcp_matcher.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index fb4588272..f0e544f9c 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -341,58 +341,6 @@ func goString(cstring []byte) string {
 	return string(cstring)
 }
 
-// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
-// in include/uapi/linux/netfilter/xt_tcpudp.h.
-type XTTCP struct {
-	// SourcePortStart specifies the inclusive start of the range of source
-	// ports to which the matcher applies.
-	SourcePortStart uint16
-
-	// SourcePortEnd specifies the inclusive end of the range of source ports
-	// to which the matcher applies.
-	SourcePortEnd uint16
-
-	// DestinationPortStart specifies the start of the destination port
-	// range to which the matcher applies.
-	DestinationPortStart uint16
-
-	// DestinationPortEnd specifies the start of the destination port
-	// range to which the matcher applies.
-	DestinationPortEnd uint16
-
-	// Option specifies that a particular TCP option must be set.
-	Option uint8
-
-	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
-	// fields.
-	FlagMask uint8
-
-	// FlagCompare is binary and-ed with the TCP flag fields.
-	FlagCompare uint8
-
-	// InverseFlags flips the meaning of certain fields. See the
-	// TX_TCP_INV_* flags.
-	InverseFlags uint8
-}
-
-// SizeOfXTTCP is the size of an XTTCP.
-const SizeOfXTTCP = 12
-
-// Flags in XTTCP.InverseFlags. Corresponding constants are in
-// include/uapi/linux/netfilter/xt_tcpudp.h.
-const (
-	// Invert the meaning of SourcePortStart/End.
-	XT_TCP_INV_SRCPT = 0x01
-	// Invert the meaning of DestinationPortStart/End.
-	XT_TCP_INV_DSTPT = 0x02
-	// Invert the meaning of FlagCompare.
-	XT_TCP_INV_FLAGS = 0x04
-	// Invert the meaning of Option.
-	XT_TCP_INV_OPTION = 0x08
-	// Enable all flags.
-	XT_TCP_INV_MASK = 0x0F
-)
-
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 45296b339..f8ed1acbc 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -131,7 +131,6 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
-// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
@@ -456,31 +455,6 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var matcher iptables.Matcher
 		var err error
 		switch match.Name.String() {
-		case "tcp":
-			if len(buf) < linux.SizeOfXTTCP {
-				log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal))
-				return nil, syserr.ErrInvalidArgument
-			}
-			var matchData linux.XTTCP
-			// For alignment reasons, the match's total size may exceed what's
-			// strictly necessary to hold matchData.
-			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
-			log.Infof("parseMatchers: parsed XTTCP: %+v", matchData)
-			matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{
-				SourcePortStart:      matchData.SourcePortStart,
-				SourcePortEnd:        matchData.SourcePortEnd,
-				DestinationPortStart: matchData.DestinationPortStart,
-				DestinationPortEnd:   matchData.DestinationPortEnd,
-				Option:               matchData.Option,
-				FlagMask:             matchData.FlagMask,
-				FlagCompare:          matchData.FlagCompare,
-				InverseFlags:         matchData.InverseFlags,
-			})
-			if err != nil {
-				log.Warningf("netfilter: failed to create TCP matcher: %v", err)
-				return nil, syserr.ErrInvalidArgument
-			}
-
 		case "udp":
 			if len(buf) < linux.SizeOfXTUDP {
 				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
diff --git a/pkg/tcpip/iptables/tcp_matcher.go b/pkg/tcpip/iptables/tcp_matcher.go
deleted file mode 100644
index 6acbd6eb9..000000000
--- a/pkg/tcpip/iptables/tcp_matcher.go
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-type TCPMatcher struct {
-	data TCPMatcherData
-
-	// tablename string
-	// unsigned int matchsize;
-	// unsigned int usersize;
-	// #ifdef CONFIG_COMPAT
-	// unsigned int compatsize;
-	// #endif
-	// unsigned int hooks;
-	// unsigned short proto;
-	// unsigned short family;
-}
-
-// TODO: Delete?
-// MatchCheckEntryParams
-
-type TCPMatcherData struct {
-	// Filter IPHeaderFilter
-
-	SourcePortStart      uint16
-	SourcePortEnd        uint16
-	DestinationPortStart uint16
-	DestinationPortEnd   uint16
-	Option               uint8
-	FlagMask             uint8
-	FlagCompare          uint8
-	InverseFlags         uint8
-}
-
-func NewTCPMatcher(filter IPHeaderFilter, data TCPMatcherData) (Matcher, error) {
-	// TODO: We currently only support source port and destination port.
-	log.Infof("Adding rule with TCPMatcherData: %+v", data)
-
-	if data.Option != 0 ||
-		data.FlagMask != 0 ||
-		data.FlagCompare != 0 ||
-		data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported TCP matcher flags set")
-	}
-
-	if filter.Protocol != header.TCPProtocolNumber {
-		log.Warningf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
-	}
-
-	return &TCPMatcher{data: data}, nil
-}
-
-// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
-func (tm *TCPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader)
-
-	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
-	// codesearch).
-	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
-		return false, false
-	}
-
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			log.Warningf("Dropping TCP packet: malicious packet with fragment with fragment offest of 1.")
-			return false, true
-		}
-		return false, false
-	}
-
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO
-	var tcpHeader header.TCP
-	if pkt.TransportHeader != nil {
-		tcpHeader = header.TCP(pkt.TransportHeader)
-	} else {
-		// The TCP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.TCPMinimumSize {
-			// There's no valid TCP header here, so we hotdrop the
-			// packet.
-			// TODO: Stats.
-			log.Warningf("Dropping TCP packet: size to small.")
-			return false, true
-		}
-		tcpHeader = header.TCP(pkt.Data.First())
-	}
-
-	// Check whether the source and destination ports are within the
-	// matching range.
-	sourcePort := tcpHeader.SourcePort()
-	destinationPort := tcpHeader.DestinationPort()
-	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
-		return false, false
-	}
-	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
-		return false, false
-	}
-
-	return true, false
-}
-- 
cgit v1.2.3


From 421b6ff18154f80ea8cbbfd8340042ab458bf813 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:54:39 -0800
Subject: Passes all filter table UDP tests.

---
 pkg/tcpip/iptables/BUILD | 1 -
 1 file changed, 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index ff4e3c932..e41c645ed 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "iptables.go",
         "targets.go",
-        "tcp_matcher.go",
         "types.go",
         "udp_matcher.go",
     ],
-- 
cgit v1.2.3


From 538053538dfb378aa8bc512d484ea305177e617b Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 16:51:17 -0800
Subject: Adding serialization.

---
 pkg/sentry/socket/netfilter/netfilter.go | 29 ++++++++++++++++++++++++++++-
 pkg/tcpip/iptables/udp_matcher.go        | 14 +++++++-------
 2 files changed, 35 insertions(+), 8 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index f8ed1acbc..3caabca9a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -196,7 +196,9 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 }
 
 func marshalMatcher(matcher iptables.Matcher) []byte {
-	switch matcher.(type) {
+	switch m := matcher.(type) {
+	case *iptables.UDPMatcher:
+		return marshalUDPMatcher(m)
 	default:
 		// TODO(gvisor.dev/issue/170): We don't support any matchers
 		// yet, so any call to marshalMatcher will panic.
@@ -204,6 +206,31 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 	}
 }
 
+func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
+	type udpMatch struct {
+		linux.XTEntryMatch
+		linux.XTUDP
+	}
+	linuxMatcher := udpMatch{
+		XTEntryMatch: linux.XTEntryMatch{
+			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
+			// Name:      "udp",
+		},
+		XTUDP: linux.XTUDP{
+			SourcePortStart:      matcher.Data.SourcePortStart,
+			SourcePortEnd:        matcher.Data.SourcePortEnd,
+			DestinationPortStart: matcher.Data.DestinationPortStart,
+			DestinationPortEnd:   matcher.Data.DestinationPortEnd,
+			InverseFlags:         matcher.Data.InverseFlags,
+		},
+	}
+	copy(linuxMatcher.Name[:], "udp")
+
+	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte
+	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
+	return buf[:]
+}
+
 func marshalTarget(target iptables.Target) []byte {
 	switch target.(type) {
 	case iptables.UnconditionalAcceptTarget:
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index ce4368a3d..fca457199 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -24,7 +24,7 @@ import (
 )
 
 type UDPMatcher struct {
-	data UDPMatcherData
+	Data UDPMatcherData
 
 	// tablename string
 	// unsigned int matchsize;
@@ -62,11 +62,11 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
 	}
 
-	return &UDPMatcher{data: data}, nil
+	return &UDPMatcher{Data: data}, nil
 }
 
 // TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
-func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
 	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
@@ -114,12 +114,12 @@ func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	destinationPort := udpHeader.DestinationPort()
 	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
 		udpHeader.SourcePort(), udpHeader.DestinationPort(),
-		tm.data.SourcePortStart, tm.data.SourcePortEnd,
-		tm.data.DestinationPortStart, tm.data.DestinationPortEnd)
-	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
+		um.Data.SourcePortStart, um.Data.SourcePortEnd,
+		um.Data.DestinationPortStart, um.Data.DestinationPortEnd)
+	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
 		return false, false
 	}
-	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
+	if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort {
 		return false, false
 	}
 
-- 
cgit v1.2.3


From b7853f688b4bcd3465c0c3087fcbd8d53bdf26ae Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 22 Jan 2020 14:46:15 -0800
Subject: Error marshalling the matcher.

The iptables binary is looking for libxt_.so when it should be looking
for libxt_udp.so, so it's having an issue reading the data in
xt_match_entry. I think it may be an alignment issue.

Trying to fix this is leading to me fighting with the metadata struct,
so I'm gonna go kill that.
---
 pkg/abi/linux/netfilter.go               |  5 +++++
 pkg/sentry/socket/netfilter/netfilter.go | 35 ++++++++++++++++++++------------
 pkg/tcpip/iptables/udp_matcher.go        |  2 +-
 3 files changed, 28 insertions(+), 14 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index f0e544f9c..effed7976 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -198,6 +198,11 @@ type XTEntryMatch struct {
 // SizeOfXTEntryMatch is the size of an XTEntryMatch.
 const SizeOfXTEntryMatch = 32
 
+type KernelXTEntryMatch struct {
+	XTEntryMatch
+	Data []byte
+}
+
 // XTEntryTarget holds a target for a rule. For example, it can specify that
 // packets matching the rule should DROP, ACCEPT, or use an extension target.
 // iptables-extension(8) has a list of possible targets.
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3caabca9a..b49fe5b3e 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -207,26 +207,34 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	type udpMatch struct {
-		linux.XTEntryMatch
-		linux.XTUDP
-	}
-	linuxMatcher := udpMatch{
+	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
 			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
 			// Name:      "udp",
 		},
-		XTUDP: linux.XTUDP{
-			SourcePortStart:      matcher.Data.SourcePortStart,
-			SourcePortEnd:        matcher.Data.SourcePortEnd,
-			DestinationPortStart: matcher.Data.DestinationPortStart,
-			DestinationPortEnd:   matcher.Data.DestinationPortEnd,
-			InverseFlags:         matcher.Data.InverseFlags,
-		},
+		Data: make([]byte, linux.SizeOfXTUDP+22),
 	}
+	// copy(linuxMatcher.Name[:], "udp")
 	copy(linuxMatcher.Name[:], "udp")
 
-	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte
+	// TODO: Must be aligned.
+	xtudp := linux.XTUDP{
+		SourcePortStart:      matcher.Data.SourcePortStart,
+		SourcePortEnd:        matcher.Data.SourcePortEnd,
+		DestinationPortStart: matcher.Data.DestinationPortStart,
+		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
+		InverseFlags:         matcher.Data.InverseFlags,
+	}
+	binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp)
+
+	if binary.Size(linuxMatcher)%64 != 0 {
+		panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher)))
+	}
+
+	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte
+	if len(buf)%64 != 0 {
+		panic(fmt.Sprintf("len is actually: %d", len(buf)))
+	}
 	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
 	return buf[:]
 }
@@ -245,6 +253,7 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
+	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index fca457199..65ae7f9e0 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -59,7 +59,7 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 	}
 
 	if filter.Protocol != header.UDPProtocolNumber {
-		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
 	}
 
 	return &UDPMatcher{Data: data}, nil
-- 
cgit v1.2.3


From 2946fe81627afa223853769ed736e2a56e0144b7 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 24 Jan 2020 17:12:03 -0800
Subject: We can now actually write out the udp matcher.

---
 pkg/sentry/socket/netfilter/netfilter.go | 78 ++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 20 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3ca22932d..6c88a50a6 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -36,7 +36,7 @@ const errorTargetName = "ERROR"
 
 // metadata is opaque to netstack. It holds data that we need to translate
 // between Linux's and netstack's iptables representations.
-// TODO(gvisor.dev/issue/170): This might be removable.
+// TODO(gvisor.dev/issue/170): Use metadata to check correctness.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -44,6 +44,14 @@ type metadata struct {
 	Size       uint32
 }
 
+const enableDebugLog = true
+
+func nflog(format string, args ...interface{}) {
+	if enableDebugLog {
+		log.Infof("netfilter: "+format, args...)
+	}
+}
+
 // GetInfo returns information about iptables.
 func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
 	// Read in the struct and table name.
@@ -72,6 +80,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	info.NumEntries = metadata.NumEntries
 	info.Size = metadata.Size
 
+	nflog("GetInfo returning info: %+v", info)
+
 	return info, nil
 }
 
@@ -80,21 +90,26 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+		log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
 	}
 
 	// Find the appropriate table.
 	table, err := findTable(stack, userEntries.Name)
 	if err != nil {
+		log.Warningf("netfilter: couldn't find table %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, err
 	}
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
-	entries, _, err := convertNetstackToBinary(userEntries.Name.String(), table)
+	entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
+	if meta != table.Metadata().(metadata) {
+		panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
+	}
 	if binary.Size(entries) > uintptr(outLen) {
 		log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -148,15 +163,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	copy(entries.Name[:], tablename)
 
 	for ruleIdx, rule := range table.Rules {
+		nflog("Current offset: %d", entries.Size)
+
 		// Is this a chain entry point?
 		for hook, hookRuleIdx := range table.BuiltinChains {
 			if hookRuleIdx == ruleIdx {
+				nflog("Found hook %d at offset %d", hook, entries.Size)
 				meta.HookEntry[hook] = entries.Size
 			}
 		}
 		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
+				nflog("Found underflow %d at offset %d", underflow, entries.Size)
 				meta.Underflow[underflow] = entries.Size
 			}
 		}
@@ -176,6 +195,10 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 			// Serialize the matcher and add it to the
 			// entry.
 			serialized := marshalMatcher(matcher)
+			nflog("matcher serialized as: %v", serialized)
+			if len(serialized)%8 != 0 {
+				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+			}
 			entry.Elems = append(entry.Elems, serialized...)
 			entry.NextOffset += uint16(len(serialized))
 			entry.TargetOffset += uint16(len(serialized))
@@ -183,18 +206,25 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 
 		// Serialize and append the target.
 		serialized := marshalTarget(rule.Target)
+		if len(serialized)%8 != 0 {
+			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+		}
 		entry.Elems = append(entry.Elems, serialized...)
 		entry.NextOffset += uint16(len(serialized))
 
+		nflog("Adding entry: %+v", entry)
+
 		entries.Size += uint32(entry.NextOffset)
 		entries.Entrytable = append(entries.Entrytable, entry)
 		meta.NumEntries++
 	}
 
+	nflog("Finished with an marshalled size of %d", meta.Size)
 	meta.Size = entries.Size
 	return entries, meta, nil
 }
 
+// TODO: SOMEHOW THIS IS NOT GETTING APPENDED!
 func marshalMatcher(matcher iptables.Matcher) []byte {
 	switch m := matcher.(type) {
 	case *iptables.UDPMatcher:
@@ -207,17 +237,17 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
+	nflog("Marshalling UDP matcher: %+v", matcher)
+
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
+			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6,
 			// Name:      "udp",
 		},
-		Data: make([]byte, linux.SizeOfXTUDP+22),
+		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
-	// copy(linuxMatcher.Name[:], "udp")
 	copy(linuxMatcher.Name[:], "udp")
 
-	// TODO: Must be aligned.
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.Data.SourcePortStart,
 		SourcePortEnd:        matcher.Data.SourcePortEnd,
@@ -225,17 +255,17 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
 		InverseFlags:         matcher.Data.InverseFlags,
 	}
-	binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp)
-
-	if binary.Size(linuxMatcher)%64 != 0 {
-		panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher)))
-	}
-
-	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte
-	if len(buf)%64 != 0 {
-		panic(fmt.Sprintf("len is actually: %d", len(buf)))
-	}
-	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
+	nflog("marshalUDPMatcher: xtudp: %+v", xtudp)
+	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
+	nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher)
+
+	// We have to pad this struct size to a multiple of 8 bytes, so we make
+	// this a little longer than it needs to be.
+	buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6)
+	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
+	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
+	nflog("Marshalled into matcher of size %d", len(buf))
+	nflog("marshalUDPMatcher: buf is: %v", buf)
 	return buf[:]
 }
 
@@ -253,6 +283,8 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
+	nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+
 	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
@@ -321,7 +353,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
-	printReplace(optVal)
+	// printReplace(optVal)
 
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
@@ -343,10 +375,14 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
+	nflog("Setting entries in table %q", replace.Name.String())
+
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
 	var offsets []uint32
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		nflog("Processing entry at offset %d", offset)
+
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
 			log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
@@ -464,9 +500,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
 func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+	nflog("Parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
-		log.Infof("parseMatchers: optVal has len %d", len(optVal))
+		nflog("parseMatchers: optVal has len %d", len(optVal))
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
 			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
@@ -475,7 +512,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
 		binary.Unmarshal(buf, usermem.ByteOrder, &match)
-		log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+		nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
@@ -532,6 +569,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
 func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
+	nflog("Parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, syserr.ErrInvalidArgument
-- 
cgit v1.2.3


From 29316e66adfc49c158425554761e34c12338f1d9 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 12:27:04 -0800
Subject: Cleanup for GH review.

---
 pkg/abi/linux/netfilter.go               |  14 +--
 pkg/sentry/socket/netfilter/netfilter.go | 144 ++++++++++++-------------------
 pkg/tcpip/iptables/types.go              |  15 ----
 pkg/tcpip/iptables/udp_matcher.go        |  62 ++++++-------
 test/iptables/filter_input.go            |   6 +-
 5 files changed, 88 insertions(+), 153 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index effed7976..8e40bcc62 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -198,6 +198,8 @@ type XTEntryMatch struct {
 // SizeOfXTEntryMatch is the size of an XTEntryMatch.
 const SizeOfXTEntryMatch = 32
 
+// KernelXTEntryMatch is identical to XTEntryMatch, but contains
+// variable-length Data field.
 type KernelXTEntryMatch struct {
 	XTEntryMatch
 	Data []byte
@@ -349,19 +351,19 @@ func goString(cstring []byte) string {
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
-	// SourcePortStart specifies the inclusive start of the range of source
-	// ports to which the matcher applies.
+	// SourcePortStart is the inclusive start of the range of source ports
+	// to which the matcher applies.
 	SourcePortStart uint16
 
-	// SourcePortEnd specifies the inclusive end of the range of source ports
-	// to which the matcher applies.
+	// SourcePortEnd is the inclusive end of the range of source ports to
+	// which the matcher applies.
 	SourcePortEnd uint16
 
-	// DestinationPortStart specifies the start of the destination port
+	// DestinationPortStart is the inclusive start of the destination port
 	// range to which the matcher applies.
 	DestinationPortStart uint16
 
-	// DestinationPortEnd specifies the start of the destination port
+	// DestinationPortEnd is the inclusive end of the destination port
 	// range to which the matcher applies.
 	DestinationPortEnd uint16
 
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 6c88a50a6..b8848f08a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -34,9 +34,16 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
-// metadata is opaque to netstack. It holds data that we need to translate
-// between Linux's and netstack's iptables representations.
-// TODO(gvisor.dev/issue/170): Use metadata to check correctness.
+const (
+	matcherNameUDP = "udp"
+)
+
+// Metadata is used to verify that we are correctly serializing and
+// deserializing iptables into structs consumable by the iptables tool. We save
+// a metadata struct when the tables are written, and when they are read out we
+// verify that certain fields are the same.
+//
+// metadata is opaque to netstack.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -44,10 +51,12 @@ type metadata struct {
 	Size       uint32
 }
 
-const enableDebugLog = true
+const enableDebug = false
 
+// nflog logs messages related to the writing and reading of iptables, but only
+// when enableDebug is true.
 func nflog(format string, args ...interface{}) {
-	if enableDebugLog {
+	if enableDebug {
 		log.Infof("netfilter: "+format, args...)
 	}
 }
@@ -80,7 +89,7 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	info.NumEntries = metadata.NumEntries
 	info.Size = metadata.Size
 
-	nflog("GetInfo returning info: %+v", info)
+	nflog("returning info: %+v", info)
 
 	return info, nil
 }
@@ -163,19 +172,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	copy(entries.Name[:], tablename)
 
 	for ruleIdx, rule := range table.Rules {
-		nflog("Current offset: %d", entries.Size)
+		nflog("convert to binary: current offset: %d", entries.Size)
 
 		// Is this a chain entry point?
 		for hook, hookRuleIdx := range table.BuiltinChains {
 			if hookRuleIdx == ruleIdx {
-				nflog("Found hook %d at offset %d", hook, entries.Size)
+				nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
 				meta.HookEntry[hook] = entries.Size
 			}
 		}
 		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
-				nflog("Found underflow %d at offset %d", underflow, entries.Size)
+				nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
 				meta.Underflow[underflow] = entries.Size
 			}
 		}
@@ -195,7 +204,7 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 			// Serialize the matcher and add it to the
 			// entry.
 			serialized := marshalMatcher(matcher)
-			nflog("matcher serialized as: %v", serialized)
+			nflog("convert to binary: matcher serialized as: %v", serialized)
 			if len(serialized)%8 != 0 {
 				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
 			}
@@ -212,14 +221,14 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 		entry.Elems = append(entry.Elems, serialized...)
 		entry.NextOffset += uint16(len(serialized))
 
-		nflog("Adding entry: %+v", entry)
+		nflog("convert to binary: adding entry: %+v", entry)
 
 		entries.Size += uint32(entry.NextOffset)
 		entries.Entrytable = append(entries.Entrytable, entry)
 		meta.NumEntries++
 	}
 
-	nflog("Finished with an marshalled size of %d", meta.Size)
+	nflog("convert to binary: finished with an marshalled size of %d", meta.Size)
 	meta.Size = entries.Size
 	return entries, meta, nil
 }
@@ -237,16 +246,18 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	nflog("Marshalling UDP matcher: %+v", matcher)
+	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
+
+	// We have to pad this struct size to a multiple of 8 bytes.
+	const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6
 
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6,
-			// Name:      "udp",
+			MatchSize: size,
 		},
 		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
-	copy(linuxMatcher.Name[:], "udp")
+	copy(linuxMatcher.Name[:], matcherNameUDP)
 
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.Data.SourcePortStart,
@@ -255,17 +266,12 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
 		InverseFlags:         matcher.Data.InverseFlags,
 	}
-	nflog("marshalUDPMatcher: xtudp: %+v", xtudp)
 	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
-	nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher)
 
-	// We have to pad this struct size to a multiple of 8 bytes, so we make
-	// this a little longer than it needs to be.
-	buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6)
+	buf := make([]byte, 0, size)
 	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
 	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
-	nflog("Marshalled into matcher of size %d", len(buf))
-	nflog("marshalUDPMatcher: buf is: %v", buf)
+	nflog("convert to binary: marshalled UDP matcher into %v", buf)
 	return buf[:]
 }
 
@@ -283,9 +289,8 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
-	nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
 
-	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
@@ -353,8 +358,6 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
-	// printReplace(optVal)
-
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
 		log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
@@ -375,13 +378,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
-	nflog("Setting entries in table %q", replace.Name.String())
+	nflog("set entries: setting entries in table %q", replace.Name.String())
 
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
 	var offsets []uint32
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		nflog("Processing entry at offset %d", offset)
+		nflog("set entries: processing entry at offset %d", offset)
 
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
@@ -406,11 +409,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			return err
 		}
 
-		// TODO: Matchers (and maybe targets) can specify that they only work for certiain protocols, hooks, tables.
+		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
+		// that they only work for certiain protocols, hooks, tables.
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
 			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 		matchers, err := parseMatchers(filter, optVal[:matchersSize])
 		if err != nil {
@@ -423,6 +428,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		targetSize := entry.NextOffset - entry.TargetOffset
 		if len(optVal) < int(targetSize) {
 			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
@@ -500,10 +506,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
 func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
-	nflog("Parsing matchers of size %d", len(optVal))
+	nflog("set entries: parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
-		nflog("parseMatchers: optVal has len %d", len(optVal))
+		nflog("set entries: optVal has len %d", len(optVal))
+
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
 			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
@@ -512,7 +519,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
 		binary.Unmarshal(buf, usermem.ByteOrder, &match)
-		nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+		nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match)
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
@@ -528,17 +535,17 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var matcher iptables.Matcher
 		var err error
 		switch match.Name.String() {
-		case "udp":
+		case matcherNameUDP:
 			if len(buf) < linux.SizeOfXTUDP {
 				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
 				return nil, syserr.ErrInvalidArgument
 			}
+			// For alignment reasons, the match's total size may
+			// exceed what's strictly necessary to hold matchData.
 			var matchData linux.XTUDP
-			// For alignment reasons, the match's total size may exceed what's
-			// strictly necessary to hold matchData.
 			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
 			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
-			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{
+			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{
 				SourcePortStart:      matchData.SourcePortStart,
 				SourcePortEnd:        matchData.SourcePortEnd,
 				DestinationPortStart: matchData.DestinationPortStart,
@@ -557,19 +564,22 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		matchers = append(matchers, matcher)
 
-		// TODO: Support revision.
-		// TODO: Support proto -- matchers usually specify which proto(s) they work with.
+		// TODO(gvisor.dev/issue/170): Check the revision field.
 		optVal = optVal[match.MatchSize:]
 	}
 
-	// TODO: Check that optVal is exhausted.
+	if len(optVal) != 0 {
+		log.Warningf("netfilter: optVal should be exhausted after parsing matchers")
+		return nil, syserr.ErrInvalidArgument
+	}
+
 	return matchers, nil
 }
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
 func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
-	nflog("Parsing target of size %d", len(optVal))
+	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, syserr.ErrInvalidArgument
@@ -598,7 +608,8 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case iptables.Drop:
 			return iptables.UnconditionalDropTarget{}, nil
 		default:
-			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
+			log.Warningf("Unknown verdict: %v", verdict)
+			return nil, syserr.ErrInvalidArgument
 		}
 
 	case errorTargetName:
@@ -673,52 +684,3 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
-
-// printReplace prints information about the struct ipt_replace in optVal. It
-// is only for debugging.
-func printReplace(optVal []byte) {
-	// Basic replace info.
-	var replace linux.IPTReplace
-	replaceBuf := optVal[:linux.SizeOfIPTReplace]
-	optVal = optVal[linux.SizeOfIPTReplace:]
-	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
-	log.Infof("Replacing table %q: %+v", replace.Name.String(), replace)
-
-	// Read in the list of entries at the end of replace.
-	var totalOffset uint16
-	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		var entry linux.IPTEntry
-		entryBuf := optVal[:linux.SizeOfIPTEntry]
-		binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry)
-		log.Infof("Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry)
-
-		totalOffset += entry.NextOffset
-		if entry.TargetOffset == linux.SizeOfIPTEntry {
-			log.Infof("Entry has no matches.")
-		} else {
-			log.Infof("Entry has matches.")
-		}
-
-		var target linux.XTEntryTarget
-		targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget]
-		binary.Unmarshal(targetBuf, usermem.ByteOrder, &target)
-		log.Infof("Target named %q: %+v", target.Name.String(), target)
-
-		switch target.Name.String() {
-		case "":
-			var standardTarget linux.XTStandardTarget
-			stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget]
-			binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget)
-			log.Infof("Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict)
-		case errorTargetName:
-			var errorTarget linux.XTErrorTarget
-			etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget]
-			binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget)
-			log.Infof("Error target with name %q.", errorTarget.Name.String())
-		default:
-			log.Infof("Unknown target type.")
-		}
-
-		optVal = optVal[entry.NextOffset:]
-	}
-}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index d47447d40..ba5ed75b4 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -169,8 +169,6 @@ type IPHeaderFilter struct {
 	Protocol tcpip.TransportProtocolNumber
 }
 
-// TODO: Should these be able to marshal/unmarshal themselves?
-// TODO: Something has to map the name to the matcher.
 // A Matcher is the interface for matching packets.
 type Matcher interface {
 	// Match returns whether the packet matches and whether the packet
@@ -179,19 +177,6 @@ type Matcher interface {
 	//
 	// Precondition: packet.NetworkHeader is set.
 	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
-
-	// TODO: Make this typesafe by having each Matcher have their own, typed CheckEntry?
-	// CheckEntry(params MatchCheckEntryParams) bool
-}
-
-// TODO: Unused?
-type MatchCheckEntryParams struct {
-	Table  string // TODO: Tables should be an enum...
-	Filter IPHeaderFilter
-	Info   interface{} // TODO: Type unsafe.
-	// HookMask       uint8
-	// Family         uint8
-	// NFTCompat      bool
 }
 
 // A Target is the interface for taking an action for a packet.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index 65ae7f9e0..f59ca2027 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -16,33 +16,28 @@ package iptables
 
 import (
 	"fmt"
-	"runtime/debug"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+// TODO(gvisor.dev/issue/170): The following per-matcher params should be
+// supported:
+// - Table name
+// - Match size
+// - User size
+// - Hooks
+// - Proto
+// - Family
+
+// UDPMatcher matches UDP packets and their headers. It implements Matcher.
 type UDPMatcher struct {
-	Data UDPMatcherData
-
-	// tablename string
-	// unsigned int matchsize;
-	// unsigned int usersize;
-	// #ifdef CONFIG_COMPAT
-	// unsigned int compatsize;
-	// #endif
-	// unsigned int hooks;
-	// unsigned short proto;
-	// unsigned short family;
+	Data UDPMatcherParams
 }
 
-// TODO: Delete?
-// MatchCheckEntryParams
-
-type UDPMatcherData struct {
-	// Filter IPHeaderFilter
-
+// UDPMatcherParams are the parameters used to create a UDPMatcher.
+type UDPMatcherParams struct {
 	SourcePortStart      uint16
 	SourcePortEnd        uint16
 	DestinationPortStart uint16
@@ -50,12 +45,12 @@ type UDPMatcherData struct {
 	InverseFlags         uint8
 }
 
-func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) {
-	// TODO: We currently only support source port and destination port.
-	log.Infof("Adding rule with UDPMatcherData: %+v", data)
+// NewUDPMatcher returns a new instance of UDPMatcher.
+func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) {
+	log.Infof("Adding rule with UDPMatcherParams: %+v", data)
 
 	if data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported UDP matcher flags set")
+		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
 	}
 
 	if filter.Protocol != header.UDPProtocolNumber {
@@ -65,21 +60,18 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 	return &UDPMatcher{Data: data}, nil
 }
 
-// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
+// Match implements Matcher.Match.
 func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
-	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
-	// codesearch).
+	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+	// into the iptables.Check codepath as matchers are added.
 	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
-		log.Infof("UDPMatcher: wrong protocol number")
 		return false, false
 	}
 
 	// We dont't match fragments.
 	if frag := netHeader.FragmentOffset(); frag != 0 {
-		log.Infof("UDPMatcher: it's a fragment")
 		if frag == 1 {
 			return false, true
 		}
@@ -89,20 +81,18 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 
 	// Now we need the transport header. However, this may not have been set
 	// yet.
-	// TODO
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
 	var udpHeader header.UDP
 	if pkt.TransportHeader != nil {
-		log.Infof("UDPMatcher: transport header is not nil")
 		udpHeader = header.UDP(pkt.TransportHeader)
 	} else {
-		log.Infof("UDPMatcher: transport header is nil")
-		log.Infof("UDPMatcher: is network header nil: %t", pkt.NetworkHeader == nil)
 		// The UDP header hasn't been parsed yet. We have to do it here.
 		if len(pkt.Data.First()) < header.UDPMinimumSize {
 			// There's no valid UDP header here, so we hotdrop the
 			// packet.
-			// TODO: Stats.
-			log.Warningf("Dropping UDP packet: size to small.")
+			log.Warningf("Dropping UDP packet: size too small.")
 			return false, true
 		}
 		udpHeader = header.UDP(pkt.Data.First())
@@ -112,10 +102,6 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	// matching range.
 	sourcePort := udpHeader.SourcePort()
 	destinationPort := udpHeader.DestinationPort()
-	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
-		udpHeader.SourcePort(), udpHeader.DestinationPort(),
-		um.Data.SourcePortStart, um.Data.SourcePortEnd,
-		um.Data.DestinationPortStart, um.Data.DestinationPortEnd)
 	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
 		return false, false
 	}
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index bc963d40e..e9f0978eb 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -264,9 +264,9 @@ func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
 	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
 		return err
 	}
-	// if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
-	// 	return err
-	// }
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
+		return err
+	}
 	return filterTable("-L")
 }
 
-- 
cgit v1.2.3


From e889f95671a9b7b1c6f65cb6fbc1b865a896e827 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 12:30:06 -0800
Subject: More cleanup.

---
 pkg/tcpip/iptables/udp_matcher.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index f59ca2027..3bb076f9c 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -73,9 +73,9 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	// We dont't match fragments.
 	if frag := netHeader.FragmentOffset(); frag != 0 {
 		if frag == 1 {
+			log.Warningf("Dropping UDP packet: malicious fragmented packet.")
 			return false, true
 		}
-		log.Warningf("Dropping UDP packet: malicious fragmented packet.")
 		return false, false
 	}
 
-- 
cgit v1.2.3


From d6a2e01d3e57e0837c7e5cfda3b56c4dcfbb4627 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 16:40:46 -0800
Subject: Address GH comments.

---
 pkg/sentry/socket/netfilter/netfilter.go | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index b8848f08a..a06562743 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -43,7 +43,7 @@ const (
 // a metadata struct when the tables are written, and when they are read out we
 // verify that certain fields are the same.
 //
-// metadata is opaque to netstack.
+// metadata is used by this serialization/deserializing code, not netstack.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -51,14 +51,10 @@ type metadata struct {
 	Size       uint32
 }
 
-const enableDebug = false
-
 // nflog logs messages related to the writing and reading of iptables, but only
 // when enableDebug is true.
 func nflog(format string, args ...interface{}) {
-	if enableDebug {
-		log.Infof("netfilter: "+format, args...)
-	}
+	log.Infof("netfilter: "+format, args...)
 }
 
 // GetInfo returns information about iptables.
@@ -233,14 +229,12 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-// TODO: SOMEHOW THIS IS NOT GETTING APPENDED!
 func marshalMatcher(matcher iptables.Matcher) []byte {
 	switch m := matcher.(type) {
 	case *iptables.UDPMatcher:
 		return marshalUDPMatcher(m)
 	default:
-		// TODO(gvisor.dev/issue/170): We don't support any matchers
-		// yet, so any call to marshalMatcher will panic.
+		// TODO(gvisor.dev/issue/170): Support other matchers.
 		panic(fmt.Errorf("unknown matcher of type %T", matcher))
 	}
 }
@@ -249,11 +243,11 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
 
 	// We have to pad this struct size to a multiple of 8 bytes.
-	const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6
+	size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8)
 
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: size,
+			MatchSize: uint16(size),
 		},
 		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
@@ -270,7 +264,7 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 
 	buf := make([]byte, 0, size)
 	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
-	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
+	buf = append(buf, make([]byte, size-len(buf))...)
 	nflog("convert to binary: marshalled UDP matcher into %v", buf)
 	return buf[:]
 }
@@ -410,7 +404,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 
 		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
-		// that they only work for certiain protocols, hooks, tables.
+		// that they only work for certain protocols, hooks, tables.
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
@@ -684,3 +678,8 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
+
+// alignUp rounds a length up to an alignment. align must be a power of 2.
+func alignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
-- 
cgit v1.2.3


From 6adbdfe232c3da42a7f6f3a7d882d140196e4068 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 29 Jan 2020 07:50:34 -0500
Subject: supporting sError in guest kernel on Arm64

For test case 'TestBounce', we use KVM_SET_VCPU_EVENTS to trigger sError
to leave guest.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/aarch64.go       |  6 +++---
 pkg/sentry/platform/ring0/entry_arm64.s    | 14 +++++++++++++-
 pkg/sentry/platform/ring0/offsets_arm64.go |  1 +
 3 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 6b078cd1e..f6da41c27 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -88,14 +88,14 @@ const (
 	El0Sync_undef
 	El0Sync_dbg
 	El0Sync_inv
-	VirtualizationException
 	_NR_INTERRUPTS
 )
 
 // System call vectors.
 const (
-	Syscall   Vector = El0Sync_svc
-	PageFault Vector = El0Sync_da
+	Syscall                 Vector = El0Sync_svc
+	PageFault               Vector = El0Sync_da
+	VirtualizationException Vector = El0Error
 )
 
 // VirtualAddressBits returns the number bits available for virtual addresses.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 679842288..baa6c4910 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -601,7 +601,19 @@ TEXT ·El0_fiq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
 TEXT ·El0_error(SB),NOSPLIT,$0
-	B ·Shutdown(SB)
+	KERNEL_ENTRY_FROM_EL0
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+	WORD $0xd538601a     //MRS   FAR_EL1, R26
+
+	MOVD R26, CPU_FAULT_ADDR(RSV_REG)
+
+	MOVD $1, R3
+	MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
+
+	MOVD $VirtualizationException, R3
+	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
+
+	B ·Halt(SB)
 
 TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index 8c960c749..057fb5c69 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -85,6 +85,7 @@ func Emit(w io.Writer) {
 
 	fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
 	fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
+	fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
 
 	p := &syscall.PtraceRegs{}
 	fmt.Fprintf(w, "\n// Ptrace registers.\n")
-- 
cgit v1.2.3


From 8dcedc953a610b97efe9f68ac8fecf5e15a7e26b Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 29 Jan 2020 10:08:25 -0800
Subject: Add //pkg/sentry/devices/memdev.

PiperOrigin-RevId: 292165063
---
 pkg/abi/linux/dev.go                |  3 ++
 pkg/sentry/devices/memdev/BUILD     | 28 +++++++++++
 pkg/sentry/devices/memdev/full.go   | 75 ++++++++++++++++++++++++++++++
 pkg/sentry/devices/memdev/memdev.go | 59 ++++++++++++++++++++++++
 pkg/sentry/devices/memdev/null.go   | 76 ++++++++++++++++++++++++++++++
 pkg/sentry/devices/memdev/random.go | 92 +++++++++++++++++++++++++++++++++++++
 pkg/sentry/devices/memdev/zero.go   | 88 +++++++++++++++++++++++++++++++++++
 7 files changed, 421 insertions(+)
 create mode 100644 pkg/sentry/devices/memdev/BUILD
 create mode 100644 pkg/sentry/devices/memdev/full.go
 create mode 100644 pkg/sentry/devices/memdev/memdev.go
 create mode 100644 pkg/sentry/devices/memdev/null.go
 create mode 100644 pkg/sentry/devices/memdev/random.go
 create mode 100644 pkg/sentry/devices/memdev/zero.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go
index 421e11256..89f9a793f 100644
--- a/pkg/abi/linux/dev.go
+++ b/pkg/abi/linux/dev.go
@@ -36,6 +36,9 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) {
 //
 // See Documentations/devices.txt and uapi/linux/major.h.
 const (
+	// MEM_MAJOR is the major device number for "memory" character devices.
+	MEM_MAJOR = 1
+
 	// TTYAUX_MAJOR is the major device number for alternate TTY devices.
 	TTYAUX_MAJOR = 5
 
diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD
new file mode 100644
index 000000000..abe58f818
--- /dev/null
+++ b/pkg/sentry/devices/memdev/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "memdev",
+    srcs = [
+        "full.go",
+        "memdev.go",
+        "null.go",
+        "random.go",
+        "zero.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/rand",
+        "//pkg/safemem",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/mm",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
new file mode 100644
index 000000000..c7e197691
--- /dev/null
+++ b/pkg/sentry/devices/memdev/full.go
@@ -0,0 +1,75 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const fullDevMinor = 7
+
+// fullDevice implements vfs.Device for /dev/full.
+type fullDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &fullFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// fullFD implements vfs.FileDescriptionImpl for /dev/full.
+type fullFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fullFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ENOSPC
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ENOSPC
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
diff --git a/pkg/sentry/devices/memdev/memdev.go b/pkg/sentry/devices/memdev/memdev.go
new file mode 100644
index 000000000..5759900c4
--- /dev/null
+++ b/pkg/sentry/devices/memdev/memdev.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memdev implements "mem" character devices, as implemented in Linux
+// by drivers/char/mem.c and drivers/char/random.c.
+package memdev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+	for minor, dev := range map[uint32]vfs.Device{
+		nullDevMinor:    nullDevice{},
+		zeroDevMinor:    zeroDevice{},
+		fullDevMinor:    fullDevice{},
+		randomDevMinor:  randomDevice{},
+		urandomDevMinor: randomDevice{},
+	} {
+		if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MEM_MAJOR, minor, dev, &vfs.RegisterDeviceOptions{
+			GroupName: "mem",
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+	for minor, name := range map[uint32]string{
+		nullDevMinor:    "null",
+		zeroDevMinor:    "zero",
+		fullDevMinor:    "full",
+		randomDevMinor:  "random",
+		urandomDevMinor: "urandom",
+	} {
+		if err := dev.CreateDeviceFile(ctx, name, vfs.CharDevice, linux.MEM_MAJOR, minor, 0666 /* mode */); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
new file mode 100644
index 000000000..33d060d02
--- /dev/null
+++ b/pkg/sentry/devices/memdev/null.go
@@ -0,0 +1,76 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const nullDevMinor = 3
+
+// nullDevice implements vfs.Device for /dev/null.
+type nullDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &nullFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// nullFD implements vfs.FileDescriptionImpl for /dev/null.
+type nullFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *nullFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *nullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, io.EOF
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *nullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, io.EOF
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *nullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *nullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *nullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
new file mode 100644
index 000000000..acfa23149
--- /dev/null
+++ b/pkg/sentry/devices/memdev/random.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	randomDevMinor  = 8
+	urandomDevMinor = 9
+)
+
+// randomDevice implements vfs.Device for /dev/random and /dev/urandom.
+type randomDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &randomFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// randomFD implements vfs.FileDescriptionImpl for /dev/random.
+type randomFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	// off is the "file offset". off is accessed using atomic memory
+	// operations.
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *randomFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *randomFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader})
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *randomFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader})
+	atomic.AddInt64(&fd.off, n)
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *randomFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	// In Linux, this mixes the written bytes into the entropy pool; we just
+	// throw them away.
+	return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *randomFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	atomic.AddInt64(&fd.off, src.NumBytes())
+	return src.NumBytes(), nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *randomFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Linux: drivers/char/random.c:random_fops.llseek == urandom_fops.llseek
+	// == noop_llseek
+	return atomic.LoadInt64(&fd.off), nil
+}
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
new file mode 100644
index 000000000..3b1372b9e
--- /dev/null
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memdev
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const zeroDevMinor = 5
+
+// zeroDevice implements vfs.Device for /dev/zero.
+type zeroDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &zeroFD{}
+	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// zeroFD implements vfs.FileDescriptionImpl for /dev/zero.
+type zeroFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *zeroFD) Release() {
+	// noop
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *zeroFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *zeroFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return dst.ZeroOut(ctx, dst.NumBytes())
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *zeroFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *zeroFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return src.NumBytes(), nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+	if err != nil {
+		return err
+	}
+	opts.MappingIdentity = m
+	opts.Mappable = m
+	return nil
+}
-- 
cgit v1.2.3


From 37bb502670caefd4113da062495b4e318ea0f72e Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 29 Jan 2020 10:35:32 -0800
Subject: sentry: rename SetRSEQInterruptedIP to SetOldRSeqInterruptedIP for
 arm64

For amd64, this has been done on cl/288342928.

PiperOrigin-RevId: 292170856
---
 pkg/sentry/arch/arch_arm64.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 94f1a808f..ac98897b5 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -137,8 +137,8 @@ func (c *context64) SetTLS(value uintptr) bool {
 	return false
 }
 
-// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
-func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 	c.Regs.Regs[3] = uint64(value)
 }
 
-- 
cgit v1.2.3


From 148fda60e8dee29f2df85e3104e3d5de1a225bcf Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 29 Jan 2020 11:15:59 -0800
Subject: Add plumbing for file locks in VFS2.

Updates #1480

PiperOrigin-RevId: 292180192
---
 pkg/sentry/vfs/BUILD                         |  1 +
 pkg/sentry/vfs/file_description.go           | 21 ++++++++++++++++++++-
 pkg/sentry/vfs/file_description_impl_util.go | 21 +++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index ced9d07b1..14b39eb9d 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -44,6 +44,7 @@ go_library(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
         "//pkg/sync",
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index badacb55e..5bac660c7 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -393,7 +394,25 @@ type FileDescriptionImpl interface {
 	// Removexattr removes the given extended attribute from the file.
 	Removexattr(ctx context.Context, name string) error
 
-	// TODO: file locking
+	// LockBSD tries to acquire a BSD-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): BSD-style file locking
+	LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
+
+	// LockBSD releases a BSD-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): BSD-style file locking
+	UnlockBSD(ctx context.Context, uid lock.UniqueID) error
+
+	// LockPOSIX tries to acquire a POSIX-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): POSIX-style file locking
+	LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+
+	// UnlockPOSIX releases a POSIX-style advisory file lock.
+	//
+	// TODO(gvisor.dev/issue/1480): POSIX-style file locking
+	UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
 }
 
 // Dirent holds the information contained in struct linux_dirent64.
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index a4900c170..c2a52ec1b 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -152,6 +153,26 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string)
 	return syserror.ENOTSUP
 }
 
+// LockBSD implements FileDescriptionImpl.LockBSD.
+func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
+	return syserror.EBADF
+}
+
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
+func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
+	return syserror.EBADF
+}
+
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
+func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
+	return syserror.EBADF
+}
+
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
+func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
+	return syserror.EBADF
+}
+
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
-- 
cgit v1.2.3


From 51b783505b1ec164b02b48a0fd234509fba01a73 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 29 Jan 2020 15:41:51 -0800
Subject: Add support for TCP_DEFER_ACCEPT.

PiperOrigin-RevId: 292233574
---
 pkg/sentry/socket/netstack/netstack.go      |  22 ++++
 pkg/tcpip/tcpip.go                          |   6 ++
 pkg/tcpip/transport/tcp/BUILD               |   1 +
 pkg/tcpip/transport/tcp/accept.go           |  25 ++---
 pkg/tcpip/transport/tcp/connect.go          |  53 +++++++++-
 pkg/tcpip/transport/tcp/endpoint.go         |  26 ++++-
 pkg/tcpip/transport/tcp/forwarder.go        |   4 +-
 pkg/tcpip/transport/tcp/tcp_test.go         | 126 ++++++++++++++++++++++
 test/syscalls/linux/socket_inet_loopback.cc | 158 ++++++++++++++++++++++++++++
 test/syscalls/linux/tcp_socket.cc           |  53 ++++++++++
 10 files changed, 451 insertions(+), 23 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8619cc506..049d04bf2 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1260,6 +1260,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_DEFER_ACCEPT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPDeferAcceptOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Second), nil
+
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1713,6 +1725,16 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		v := usermem.ByteOrder.Uint32(optVal)
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_DEFER_ACCEPT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			v = 0
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 59c9b3fb0..0fa141d58 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -626,6 +626,12 @@ type TCPLingerTimeoutOption time.Duration
 // before being marked closed.
 type TCPTimeWaitTimeoutOption time.Duration
 
+// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow a
+// accept to return a completed connection only when there is data to be
+// read. This usually means the listening socket will drop the final ACK
+// for a handshake till the specified timeout until a segment with data arrives.
+type TCPDeferAcceptOption time.Duration
+
 // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
 // TTL value for multicast messages. The default is 1.
 type MulticastTTLOption uint8
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 4acd9fb9a..7b4a87a2d 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -57,6 +57,7 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d469758eb..6101f2945 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -222,13 +222,13 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
 // the connection parameters given by the arguments.
-func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
 	if netProto == 0 {
 		netProto = s.route.NetProto
 	}
-	n := newEndpoint(l.stack, netProto, nil)
+	n := newEndpoint(l.stack, netProto, queue)
 	n.v6only = l.v6only
 	n.ID = s.id
 	n.boundNICID = s.route.NICID()
@@ -273,16 +273,17 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 // createEndpoint creates a new endpoint in connected state and then performs
 // the TCP 3-way handshake.
-func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
 	isn := generateSecureISN(s.id, l.stack.Seed())
-	ep, err := l.createConnectingEndpoint(s, isn, irs, opts)
+	ep, err := l.createConnectingEndpoint(s, isn, irs, opts, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
+	deferAccept := time.Duration(0)
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
@@ -290,13 +291,12 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
 
 	// Perform the 3-way handshake.
-	h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
-
-	h.resetToSynRcvd(isn, irs, opts)
+	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.Close()
 		if l.listenEP != nil {
@@ -377,16 +377,14 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	defer e.decSynRcvdCount()
 	defer s.decRef()
 
-	n, err := ctx.createEndpointAndPerformHandshake(s, opts)
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
 		return
 	}
 	ctx.removePendingEndpoint(n)
-	// Start the protocol goroutine.
-	wq := &waiter.Queue{}
-	n.startAcceptedLoop(wq)
+	n.startAcceptedLoop()
 	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 
 	e.deliverAccepted(n)
@@ -546,7 +544,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
 		}
 
-		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions)
+		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions, &waiter.Queue{})
 		if err != nil {
 			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 			e.stats.FailedConnectionAttempts.Increment()
@@ -576,8 +574,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// space available in the backlog.
 
 		// Start the protocol goroutine.
-		wq := &waiter.Queue{}
-		n.startAcceptedLoop(wq)
+		n.startAcceptedLoop()
 		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		go e.deliverAccepted(n)
 	}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4e3c5419c..9ff7ac261 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -86,6 +86,19 @@ type handshake struct {
 
 	// rcvWndScale is the receive window scale, as defined in RFC 1323.
 	rcvWndScale int
+
+	// startTime is the time at which the first SYN/SYN-ACK was sent.
+	startTime time.Time
+
+	// deferAccept if non-zero will drop the final ACK for a passive
+	// handshake till an ACK segment with data is received or the timeout is
+	// hit.
+	deferAccept time.Duration
+
+	// acked is true if the the final ACK for a 3-way handshake has
+	// been received. This is required to stop retransmitting the
+	// original SYN-ACK when deferAccept is enabled.
+	acked bool
 }
 
 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
@@ -112,6 +125,12 @@ func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
 	return h
 }
 
+func newPassiveHandshake(ep *endpoint, rcvWnd seqnum.Size, isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) handshake {
+	h := newHandshake(ep, rcvWnd)
+	h.resetToSynRcvd(isn, irs, opts, deferAccept)
+	return h
+}
+
 // FindWndScale determines the window scale to use for the given maximum window
 // size.
 func FindWndScale(wnd seqnum.Size) int {
@@ -181,7 +200,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 {
 
 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
 // state.
-func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) {
 	h.active = false
 	h.state = handshakeSynRcvd
 	h.flags = header.TCPFlagSyn | header.TCPFlagAck
@@ -189,6 +208,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.ackNum = irs + 1
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
+	h.deferAccept = deferAccept
 	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
@@ -352,6 +372,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 	// We have previously received (and acknowledged) the peer's SYN. If the
 	// peer acknowledges our SYN, the handshake is completed.
 	if s.flagIsSet(header.TCPFlagAck) {
+		// If deferAccept is not zero and this is a bare ACK and the
+		// timeout is not hit then drop the ACK.
+		if h.deferAccept != 0 && s.data.Size() == 0 && time.Since(h.startTime) < h.deferAccept {
+			h.acked = true
+			h.ep.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
 		// If the timestamp option is negotiated and the segment does
 		// not carry a timestamp option then the segment must be dropped
 		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
@@ -365,10 +393,16 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
 		}
 		h.state = handshakeCompleted
+
 		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+		// If the segment has data then requeue it for the receiver
+		// to process it again once main loop is started.
+		if s.data.Size() > 0 {
+			s.incRef()
+			h.ep.enqueueSegment(s)
+		}
 		h.ep.mu.Unlock()
-
 		return nil
 	}
 
@@ -471,6 +505,7 @@ func (h *handshake) execute() *tcpip.Error {
 		}
 	}
 
+	h.startTime = time.Now()
 	// Initialize the resend timer.
 	resendWaker := sleep.Waker{}
 	timeOut := time.Duration(time.Second)
@@ -524,11 +559,21 @@ func (h *handshake) execute() *tcpip.Error {
 		switch index, _ := s.Fetch(true); index {
 		case wakerForResend:
 			timeOut *= 2
-			if timeOut > 60*time.Second {
+			if timeOut > MaxRTO {
 				return tcpip.ErrTimeout
 			}
 			rt.Reset(timeOut)
-			h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			// Resend the SYN/SYN-ACK only if the following conditions hold.
+			//  - It's an active handshake (deferAccept does not apply)
+			//  - It's a passive handshake and we have not yet got the final-ACK.
+			//  - It's a passive handshake and we got an ACK but deferAccept is
+			//    enabled and we are now past the deferAccept duration.
+			// The last is required to provide a way for the peer to complete
+			// the connection with another ACK or data (as ACKs are never
+			// retransmitted on their own).
+			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
+				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			}
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 13718ff55..8d52414b7 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -498,6 +498,13 @@ type endpoint struct {
 	// without any data being acked.
 	userTimeout time.Duration
 
+	// deferAccept if non-zero specifies a user specified time during
+	// which the final ACK of a handshake will be dropped provided the
+	// ACK is a bare ACK and carries no data. If the timeout is crossed then
+	// the bare ACK is accepted and the connection is delivered to the
+	// listener.
+	deferAccept time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -1574,6 +1581,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		if time.Duration(v) > MaxRTO {
+			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		}
+		e.deferAccept = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return nil
 	}
@@ -1798,6 +1814,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case *tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -2149,9 +2171,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
-func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+func (e *endpoint) startAcceptedLoop() {
 	e.mu.Lock()
-	e.waiterQueue = waiterQueue
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2177,7 +2198,6 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
-
 	return n, n.waiterQueue, nil
 }
 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 7eb613be5..c9ee5bf06 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -157,13 +157,13 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 		TSVal:         r.synOptions.TSVal,
 		TSEcr:         r.synOptions.TSEcr,
 		SACKPermitted: r.synOptions.SACKPermitted,
-	})
+	}, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// Start the protocol goroutine.
-	ep.startAcceptedLoop(queue)
+	ep.startAcceptedLoop()
 
 	return ep, nil
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index df2fb1071..a12336d47 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6787,3 +6787,129 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 		),
 	)
 }
+
+func TestTCPDeferAccept(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give a bit of time for the socket to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+func TestTCPDeferAcceptTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Sleep for a little of the tcpDeferAccept timeout.
+	time.Sleep(tcpDeferAccept + 100*time.Millisecond)
+
+	// On timeout expiry we should get a SYN-ACK retransmission.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1)))
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give sometime for the endpoint to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2f9821555..3bf7081b9 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -828,6 +828,164 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
   EXPECT_EQ(get, kUserTimeout);
 }
 
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAccept_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now write some data to the socket.
+  int data = 0;
+  ASSERT_THAT(RetryEINTR(write)(conn_fd.get(), &data, sizeof(data)),
+              SyscallSucceedsWithValue(sizeof(data)));
+
+  // This should now cause the connection to complete and be delivered to the
+  // accept socket.
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Verify that the accepted socket returns the data written.
+  int get = -1;
+  ASSERT_THAT(RetryEINTR(recv)(accepted.get(), &get, sizeof(get), 0),
+              SyscallSucceedsWithValue(sizeof(get)));
+
+  EXPECT_EQ(get, data);
+}
+
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R once issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Verify that there is no acceptable connection before TCP_DEFER_ACCEPT
+  // timeout is hit.
+  absl::SleepFor(absl::Seconds(kTCPDeferAccept - 1));
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now sleep for a little over the TCP_DEFER_ACCEPT duration. When the timeout
+  // is hit a SYN-ACK should be retransmitted by the listener as a last ditch
+  // attempt to complete the connection with or without data.
+  absl::SleepFor(absl::Seconds(2));
+
+  // Verify that we have a connection that can be accepted even though no
+  // data was written.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 33a5ac66c..525ccbd88 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1286,6 +1286,59 @@ TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
   EXPECT_EQ(get, kTCPUserTimeout);
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // -ve TCP_DEFER_ACCEPT is same as setting it to zero.
+  constexpr int kNeg = -1;
+  EXPECT_THAT(
+      setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &kNeg, sizeof(kNeg)),
+      SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // kTCPDeferAccept is in seconds.
+  // NOTE: linux translates seconds to # of retries and back from
+  //   #of retries to seconds. Which means only certain values
+  //   translate back exactly. That's why we use 3 here, a value of
+  //   5 will result in us getting back 7 instead of 5 in the
+  //   getsockopt.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPDeferAccept);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 6f841c304d7bd9af6167d7d049bd5c594358a1b9 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Wed, 29 Jan 2020 19:54:11 -0800
Subject: Do not spawn a goroutine when calling stack.NDPDispatcher's methods

Do not start a new goroutine when calling
stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.

PiperOrigin-RevId: 292268574
---
 pkg/tcpip/stack/ndp.go      | 8 ++++----
 pkg/tcpip/stack/ndp_test.go | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 245694118..281ae786d 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -167,8 +167,8 @@ type NDPDispatcher interface {
 	// reason, such as the address being removed). If an error occured
 	// during DAD, err will be set and resolved must be ignored.
 	//
-	// This function is permitted to block indefinitely without interfering
-	// with the stack's operation.
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
 	OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
 
 	// OnDefaultRouterDiscovered will be called when a new default router is
@@ -607,8 +607,8 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 	delete(ndp.dad, addr)
 
 	// Let the integrator know DAD did not resolve.
-	if ndp.nic.stack.ndpDisp != nil {
-		go ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
 	}
 }
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 726468e41..8c76e80f2 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -497,7 +497,7 @@ func TestDADFail(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			ndpDisp := ndpDispatcher{
-				dadC: make(chan ndpDADEvent),
+				dadC: make(chan ndpDADEvent, 1),
 			}
 			ndpConfigs := stack.DefaultNDPConfigurations()
 			opts := stack.Options{
@@ -576,7 +576,7 @@ func TestDADFail(t *testing.T) {
 // removed.
 func TestDADStop(t *testing.T) {
 	ndpDisp := ndpDispatcher{
-		dadC: make(chan ndpDADEvent),
+		dadC: make(chan ndpDADEvent, 1),
 	}
 	ndpConfigs := stack.NDPConfigurations{
 		RetransmitTimer:        time.Second,
-- 
cgit v1.2.3


From ec0679737e8f9ab31ef6c7c3adb5a0005586b5a7 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 30 Jan 2020 07:12:04 -0800
Subject: Do not include the Source Link Layer option with an unspecified
 source address

When sending NDP messages with an unspecified source address, the Source
Link Layer address must not be included.

Test: stack_test.TestDADResolve
PiperOrigin-RevId: 292341334
---
 pkg/tcpip/stack/ndp.go      | 22 ++--------------------
 pkg/tcpip/stack/ndp_test.go | 12 ++++++++----
 2 files changed, 10 insertions(+), 24 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 281ae786d..31294345d 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -538,29 +538,11 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
-	linkAddr := ndp.nic.linkEP.LinkAddress()
-	isValidLinkAddr := header.IsValidUnicastEthernetAddress(linkAddr)
-	ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize
-	if isValidLinkAddr {
-		// Only include a Source Link Layer Address option if the NIC has a valid
-		// link layer address.
-		//
-		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
-		// LinkEndpoint.LinkAddress) before reaching this point.
-		ndpNSSize += header.NDPLinkLayerAddressSize
-	}
-
-	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + ndpNSSize)
-	pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
+	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
 	pkt.SetType(header.ICMPv6NeighborSolicit)
 	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
 	ns.SetTargetAddress(addr)
-
-	if isValidLinkAddr {
-		ns.Options().Serialize(header.NDPOptionsSerializer{
-			header.NDPSourceLinkLayerAddressOption(linkAddr),
-		})
-	}
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 	sent := r.Stats().ICMP.V6PacketsSent
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8c76e80f2..bc7cfbcb4 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -413,14 +413,18 @@ func TestDADResolve(t *testing.T) {
 					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
 				}
 
-				// Check NDP packet.
+				// Check NDP NS packet.
+				//
+				// As per RFC 4861 section 4.3, a possible option is the Source Link
+				// Layer option, but this option MUST NOT be included when the source
+				// address of the packet is the unspecified address.
 				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
+					checker.SrcAddr(header.IPv6Any),
+					checker.DstAddr(header.SolicitedNodeAddr(addr1)),
 					checker.TTL(header.NDPHopLimit),
 					checker.NDPNS(
 						checker.NDPNSTargetAddress(addr1),
-						checker.NDPNSOptions([]header.NDPOption{
-							header.NDPSourceLinkLayerAddressOption(linkAddr1),
-						}),
+						checker.NDPNSOptions(nil),
 					))
 			}
 		})
-- 
cgit v1.2.3


From ede8dfab3760afc8063c3418f217e52f7ec70d42 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 30 Jan 2020 09:13:36 -0800
Subject: Enforce splice offset limits

Splice must not allow negative offsets. Writes also must not allow offset +
size to overflow int64. Reads are similarly broken, but not just in splice
(b/148095030).

Reported-by: syzbot+0e1ff0b95fb2859b4190@syzkaller.appspotmail.com
PiperOrigin-RevId: 292361208
---
 pkg/sentry/fs/tmpfs/inode_file.go       | 10 ++++--
 pkg/sentry/syscalls/linux/sys_splice.go | 16 ++++------
 test/syscalls/linux/splice.cc           | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 12 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index dabc10662..25abbc151 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -17,6 +17,7 @@ package tmpfs
 import (
 	"fmt"
 	"io"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -444,10 +445,15 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
 	defer rw.f.dataMu.Unlock()
 
 	// Compute the range to write.
-	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
-	if end == rw.offset { // srcs.NumBytes() == 0?
+	if srcs.NumBytes() == 0 {
+		// Nothing to do.
 		return 0, nil
 	}
+	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
+	if end == math.MaxInt64 {
+		// Overflow.
+		return 0, syserror.EINVAL
+	}
 
 	// Check if seals prevent either file growth or all writes.
 	switch {
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index f43d6c155..fd642834b 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -25,6 +25,10 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+		return 0, syserror.EINVAL
+	}
+
 	var (
 		total int64
 		n     int64
@@ -82,11 +86,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	offsetAddr := args[2].Pointer()
 	count := int64(args[3].SizeT())
 
-	// Don't send a negative number of bytes.
-	if count < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
 	// Get files.
 	inFile := t.GetFile(inFD)
 	if inFile == nil {
@@ -136,11 +135,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			return 0, nil, err
 		}
 
-		// The offset must be valid.
-		if offset < 0 {
-			return 0, nil, syserror.EINVAL
-		}
-
 		// Do the splice.
 		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
 			Length:    count,
@@ -227,6 +221,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(outOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the destination offset.
 			opts.DstOffset = true
 			opts.DstStart = offset
@@ -244,6 +239,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(inOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the source offset.
 			opts.SrcOffset = true
 			opts.SrcStart = offset
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 85232cb1f..faa1247f6 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -60,6 +60,62 @@ TEST(SpliceTest, TwoRegularFiles) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SpliceTest, NegativeOffset) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file as write only.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("negative", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  loff_t out_offset = 0xffffffffffffffffull;
+  constexpr int kSize = 2;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+// Write offset + size overflows int64.
+//
+// This is a regression test for b/148041624.
+TEST(SpliceTest, WriteOverflow) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  // out_offset + kSize overflows INT64_MAX.
+  loff_t out_offset = 0x7ffffffffffffffeull;
+  constexpr int kSize = 3;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(SpliceTest, SamePipe) {
   // Create a new pipe.
   int fds[2];
-- 
cgit v1.2.3


From 4ee64a248ec16fcc9e526a457a66648546611bfb Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 30 Jan 2020 11:48:36 -0800
Subject: Fix for panic in endpoint.Close().

When sending a RST on shutdown we need to double check the
state after acquiring the work mutex as the endpoint could
have transitioned out of a connected state from the time
we checked it and we acquired the workMutex.

I added two tests but sadly neither reproduce the panic. I am
going to leave the tests in as they are good to have anyway.

PiperOrigin-RevId: 292393800
---
 pkg/tcpip/transport/tcp/BUILD                |  1 +
 pkg/tcpip/transport/tcp/endpoint.go          | 10 ++++-
 pkg/tcpip/transport/tcp/tcp_test.go          | 55 ++++++++++++++++++++++++++++
 test/syscalls/linux/BUILD                    |  1 +
 test/syscalls/linux/socket_ip_tcp_generic.cc | 33 +++++++++++++++++
 5 files changed, 98 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 7b4a87a2d..272e8f570 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -91,6 +91,7 @@ go_test(
     tags = ["flaky"],
     deps = [
         ":tcp",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8d52414b7..b5a8e15ee 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2047,8 +2047,14 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 				// work mutex is available.
 				if e.workMu.TryLock() {
 					e.mu.Lock()
-					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-					e.notifyProtocolGoroutine(notifyTickleWorker)
+					// We need to double check here to make
+					// sure worker has not transitioned the
+					// endpoint out of a connected state
+					// before trying to send a reset.
+					if e.EndpointState().connected() {
+						e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+						e.notifyProtocolGoroutine(notifyTickleWorker)
+					}
 					e.mu.Unlock()
 					e.workMu.Unlock()
 				} else {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index a12336d47..2c1505067 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -6913,3 +6914,57 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 		checker.SeqNum(uint32(iss+1)),
 		checker.AckNum(uint32(irs+5))))
 }
+
+func TestResetDuringClose(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	iss := seqnum.Value(789)
+	c.CreateConnected(iss, 30000, -1 /* epRecvBuf */)
+	// Send some data to make sure there is some unread
+	// data to trigger a reset on c.Close.
+	irs := c.IRS
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss.Add(1),
+		AckNum:  irs.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(irs.Add(1))),
+		checker.AckNum(uint32(iss.Add(5)))))
+
+	// Close in a separate goroutine so that we can trigger
+	// a race with the RST we send below. This should not
+	// panic due to the route being released depeding on
+	// whether Close() sends an active RST or the RST sent
+	// below is processed by the worker first.
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			SeqNum:  iss.Add(5),
+			AckNum:  c.IRS.Add(5),
+			RcvWnd:  30000,
+			Flags:   header.TCPFlagRst,
+		})
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.EP.Close()
+	}()
+
+	wg.Wait()
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 74bf068ec..7958fd0d7 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2173,6 +2173,7 @@ cc_library(
         ":socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 57ce8e169..27779e47c 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -24,6 +24,7 @@
 #include <sys/un.h>
 
 #include "gtest/gtest.h"
+#include "absl/memory/memory.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -875,5 +876,37 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
   EXPECT_EQ(get, kAbove);
 }
 
+TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+  constexpr int kThreadCount = 1000;
+  std::unique_ptr<ScopedThread> instances[kThreadCount];
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i] = absl::make_unique<ScopedThread>([&]() {
+      auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+      ScopedThread t([&]() {
+        // Close one end to trigger sending of a FIN.
+        struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+        // Wait up to 20 seconds for the data.
+        constexpr int kPollTimeoutMs = 20000;
+        ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+                    SyscallSucceedsWithValue(1));
+        ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+      });
+
+      // Send some data then close.
+      constexpr char kStr[] = "abc";
+      ASSERT_THAT(write(sockets->first_fd(), kStr, 3),
+                  SyscallSucceedsWithValue(3));
+      absl::SleepFor(absl::Milliseconds(10));
+      ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+      t.Join();
+    });
+  }
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i]->Join();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From bc3a24d62788bd6b881afeda230f1a5550a5709a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 31 Jan 2020 11:49:37 -0800
Subject: Internal change.

PiperOrigin-RevId: 292587459
---
 kokoro/kythe/generate_xrefs.sh |  2 +-
 pkg/sentry/fs/g3doc/inotify.md | 16 ++++++++--------
 pkg/sentry/mm/README.md        |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 7a0fbb3cd..323b0f77b 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -23,7 +23,7 @@ bazel version
 
 python3 -V
 
-readonly KYTHE_VERSION='v0.0.39'
+readonly KYTHE_VERSION='v0.0.41'
 readonly WORKDIR="$(mktemp -d)"
 readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
 if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md
index 71a577d9d..85063d4e6 100644
--- a/pkg/sentry/fs/g3doc/inotify.md
+++ b/pkg/sentry/fs/g3doc/inotify.md
@@ -112,11 +112,11 @@ attempts to queue a new event, it is already holding `fs.Watches.mu`. If we used
 `Inotify.mu` to also protect the event queue, this would violate the above lock
 ordering.
 
-[dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go
-[event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go
-[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go
-[inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go
-[inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go
-[inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go
-[syscall_dir]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/syscalls/linux/
-[watch]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_watch.go
+[dirent]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/dirent.go
+[event]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_event.go
+[fd_table]: https://github.com/google/gvisor/blob/master/pkg/sentry/kernel/fd_table.go
+[inode]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode.go
+[inode_watches]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode_inotify.go
+[inotify]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify.go
+[syscall_dir]: https://github.com/google/gvisor/blob/master/pkg/sentry/syscalls/linux/
+[watch]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_watch.go
diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md
index e1322e373..f4d43d927 100644
--- a/pkg/sentry/mm/README.md
+++ b/pkg/sentry/mm/README.md
@@ -274,7 +274,7 @@ In the sentry:
     methods
     [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform].
 
-[memmap]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/memmap/memmap.go
-[mm]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/mm/mm.go
-[pgalloc]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/pgalloc/pgalloc.go
-[platform]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/platform/platform.go
+[memmap]: https://github.com/google/gvisor/blob/master/pkg/sentry/memmap/memmap.go
+[mm]: https://github.com/google/gvisor/blob/master/pkg/sentry/mm/mm.go
+[pgalloc]: https://github.com/google/gvisor/blob/master/pkg/sentry/pgalloc/pgalloc.go
+[platform]: https://github.com/google/gvisor/blob/master/pkg/sentry/platform/platform.go
-- 
cgit v1.2.3


From 528dd1ec72fee1dd63c734fe92d1b972b5735b8f Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 31 Jan 2020 13:24:48 -0800
Subject: Extract multicast IP to Ethernet address mapping

Test: header.TestEthernetAddressFromMulticastIPAddress
PiperOrigin-RevId: 292604649
---
 pkg/tcpip/header/eth.go        | 41 +++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/header/eth_test.go   | 34 ++++++++++++++++++++++++++++++++++
 pkg/tcpip/network/arp/arp.go   | 19 ++-----------------
 pkg/tcpip/network/ipv6/icmp.go | 20 ++------------------
 4 files changed, 79 insertions(+), 35 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go
index f5d2c127f..b1e92d2d7 100644
--- a/pkg/tcpip/header/eth.go
+++ b/pkg/tcpip/header/eth.go
@@ -134,3 +134,44 @@ func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool {
 	// addr is a valid unicast ethernet address.
 	return true
 }
+
+// EthernetAddressFromMulticastIPv4Address returns a multicast Ethernet address
+// for a multicast IPv4 address.
+//
+// addr MUST be a multicast IPv4 address.
+func EthernetAddressFromMulticastIPv4Address(addr tcpip.Address) tcpip.LinkAddress {
+	var linkAddrBytes [EthernetAddressSize]byte
+	// RFC 1112 Host Extensions for IP Multicasting
+	//
+	// 6.4. Extensions to an Ethernet Local Network Module:
+	//
+	// An IP host group address is mapped to an Ethernet multicast
+	// address by placing the low-order 23-bits of the IP address
+	// into the low-order 23 bits of the Ethernet multicast address
+	// 01-00-5E-00-00-00 (hex).
+	linkAddrBytes[0] = 0x1
+	linkAddrBytes[2] = 0x5e
+	linkAddrBytes[3] = addr[1] & 0x7F
+	copy(linkAddrBytes[4:], addr[IPv4AddressSize-2:])
+	return tcpip.LinkAddress(linkAddrBytes[:])
+}
+
+// EthernetAddressFromMulticastIPv6Address returns a multicast Ethernet address
+// for a multicast IPv6 address.
+//
+// addr MUST be a multicast IPv6 address.
+func EthernetAddressFromMulticastIPv6Address(addr tcpip.Address) tcpip.LinkAddress {
+	// RFC 2464 Transmission of IPv6 Packets over Ethernet Networks
+	//
+	// 7. Address Mapping -- Multicast
+	//
+	// An IPv6 packet with a multicast destination address DST,
+	// consisting of the sixteen octets DST[1] through DST[16], is
+	// transmitted to the Ethernet multicast address whose first
+	// two octets are the value 3333 hexadecimal and whose last
+	// four octets are the last four octets of DST.
+	linkAddrBytes := []byte(addr[IPv6AddressSize-EthernetAddressSize:])
+	linkAddrBytes[0] = 0x33
+	linkAddrBytes[1] = 0x33
+	return tcpip.LinkAddress(linkAddrBytes[:])
+}
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
index 6634c90f5..7a0014ad9 100644
--- a/pkg/tcpip/header/eth_test.go
+++ b/pkg/tcpip/header/eth_test.go
@@ -66,3 +66,37 @@ func TestIsValidUnicastEthernetAddress(t *testing.T) {
 		})
 	}
 }
+
+func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) {
+	tests := []struct {
+		name             string
+		addr             tcpip.Address
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "IPv4 Multicast without 24th bit set",
+			addr:             "\xe0\x7e\xdc\xba",
+			expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba",
+		},
+		{
+			name:             "IPv4 Multicast with 24th bit set",
+			addr:             "\xe0\xfe\xdc\xba",
+			expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := EthernetAddressFromMulticastIPv4Address(test.addr); got != test.expectedLinkAddr {
+				t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", got, test.expectedLinkAddr)
+			}
+		})
+	}
+}
+
+func TestEthernetAddressFromMulticastIPv6Address(t *testing.T) {
+	addr := tcpip.Address("\xff\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x1a")
+	if got, want := EthernetAddressFromMulticastIPv6Address(addr), tcpip.LinkAddress("\x33\x33\x0d\x0e\x0f\x1a"); got != want {
+		t.Fatalf("got EthernetAddressFromMulticastIPv6Address(%s) = %s, want = %s", addr, got, want)
+	}
+}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 1ceaebfbd..4da13c5df 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -178,24 +178,9 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 		return broadcastMAC, true
 	}
 	if header.IsV4MulticastAddress(addr) {
-		// RFC 1112 Host Extensions for IP Multicasting
-		//
-		// 6.4. Extensions to an Ethernet Local Network Module:
-		//
-		// An IP host group address is mapped to an Ethernet multicast
-		// address by placing the low-order 23-bits of the IP address
-		// into the low-order 23 bits of the Ethernet multicast address
-		// 01-00-5E-00-00-00 (hex).
-		return tcpip.LinkAddress([]byte{
-			0x01,
-			0x00,
-			0x5e,
-			addr[header.IPv4AddressSize-3] & 0x7f,
-			addr[header.IPv4AddressSize-2],
-			addr[header.IPv4AddressSize-1],
-		}), true
+		return header.EthernetAddressFromMulticastIPv4Address(addr), true
 	}
-	return "", false
+	return tcpip.LinkAddress([]byte(nil)), false
 }
 
 // SetOption implements NetworkProtocol.
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index dc20c0fd7..7491cfc41 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -441,23 +441,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 // ResolveStaticAddress implements stack.LinkAddressResolver.
 func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
 	if header.IsV6MulticastAddress(addr) {
-		// RFC 2464 Transmission of IPv6 Packets over Ethernet Networks
-		//
-		// 7. Address Mapping -- Multicast
-		//
-		// An IPv6 packet with a multicast destination address DST,
-		// consisting of the sixteen octets DST[1] through DST[16], is
-		// transmitted to the Ethernet multicast address whose first
-		// two octets are the value 3333 hexadecimal and whose last
-		// four octets are the last four octets of DST.
-		return tcpip.LinkAddress([]byte{
-			0x33,
-			0x33,
-			addr[header.IPv6AddressSize-4],
-			addr[header.IPv6AddressSize-3],
-			addr[header.IPv6AddressSize-2],
-			addr[header.IPv6AddressSize-1],
-		}), true
+		return header.EthernetAddressFromMulticastIPv6Address(addr), true
 	}
-	return "", false
+	return tcpip.LinkAddress([]byte(nil)), false
 }
-- 
cgit v1.2.3


From eba7bdc24d31388ca81eeab251ed2db108f785dc Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 31 Jan 2020 13:46:13 -0800
Subject: iptables: enable TCP matching with "-m tcp".

A couple other things that changed:

- There's a proper extension registration system for matchers. Anyone
  adding another matcher can use tcp_matcher.go or udp_matcher.go as a
  template.
- All logging and use of syserr.Error in the netfilter package happens at the
  highest possible level (public functions). Lower-level functions just
  return normal, descriptive golang errors.
---
 pkg/abi/linux/netfilter.go                 |  52 ++++++++
 pkg/sentry/socket/netfilter/BUILD          |   4 +
 pkg/sentry/socket/netfilter/extensions.go  |  98 +++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go   | 187 ++++++++---------------------
 pkg/sentry/socket/netfilter/tcp_matcher.go | 143 ++++++++++++++++++++++
 pkg/sentry/socket/netfilter/udp_matcher.go | 142 ++++++++++++++++++++++
 pkg/tcpip/iptables/BUILD                   |   1 -
 pkg/tcpip/iptables/types.go                |   3 +
 pkg/tcpip/iptables/udp_matcher.go          | 113 -----------------
 9 files changed, 495 insertions(+), 248 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/extensions.go
 create mode 100644 pkg/sentry/socket/netfilter/tcp_matcher.go
 create mode 100644 pkg/sentry/socket/netfilter/udp_matcher.go
 delete mode 100644 pkg/tcpip/iptables/udp_matcher.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 8e40bcc62..e4aabb6bb 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -348,6 +348,58 @@ func goString(cstring []byte) string {
 	return string(cstring)
 }
 
+// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
+// in include/uapi/linux/netfilter/xt_tcpudp.h.
+type XTTCP struct {
+	// SourcePortStart specifies the inclusive start of the range of source
+	// ports to which the matcher applies.
+	SourcePortStart uint16
+
+	// SourcePortEnd specifies the inclusive end of the range of source ports
+	// to which the matcher applies.
+	SourcePortEnd uint16
+
+	// DestinationPortStart specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortStart uint16
+
+	// DestinationPortEnd specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortEnd uint16
+
+	// Option specifies that a particular TCP option must be set.
+	Option uint8
+
+	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
+	// fields.
+	FlagMask uint8
+
+	// FlagCompare is binary and-ed with the TCP flag fields.
+	FlagCompare uint8
+
+	// InverseFlags flips the meaning of certain fields. See the
+	// TX_TCP_INV_* flags.
+	InverseFlags uint8
+}
+
+// SizeOfXTTCP is the size of an XTTCP.
+const SizeOfXTTCP = 12
+
+// Flags in XTTCP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_tcpudp.h.
+const (
+	// Invert the meaning of SourcePortStart/End.
+	XT_TCP_INV_SRCPT = 0x01
+	// Invert the meaning of DestinationPortStart/End.
+	XT_TCP_INV_DSTPT = 0x02
+	// Invert the meaning of FlagCompare.
+	XT_TCP_INV_FLAGS = 0x04
+	// Invert the meaning of Option.
+	XT_TCP_INV_OPTION = 0x08
+	// Enable all flags.
+	XT_TCP_INV_MASK = 0x0F
+)
+
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index fa2a2cb66..c91ec7494 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "netfilter",
     srcs = [
+        "extensions.go",
         "netfilter.go",
+        "tcp_matcher.go",
+        "udp_matcher.go",
     ],
     # This target depends on netstack and should only be used by epsocket,
     # which is allowed to depend on netstack.
@@ -17,6 +20,7 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/syserr",
         "//pkg/tcpip",
+        "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/usermem",
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
new file mode 100644
index 000000000..5a4cac84c
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -0,0 +1,98 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// TODO(gvisor.dev/issue/170): The following per-matcher params should be
+// supported:
+// - Table name
+// - Match size
+// - User size
+// - Hooks
+// - Proto
+// - Family
+
+// matchMarshaler knows how to (un)marshal the matcher named name().
+type matchMarshaler interface {
+	// name is the matcher name as stored in the xt_entry_match struct.
+	name() string
+
+	// marshal converts from an iptables.Matcher to an ABI struct.
+	marshal(matcher iptables.Matcher) []byte
+
+	// unmarshal converts from the ABI matcher struct to an
+	// iptables.Matcher.
+	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
+}
+
+var matchMarshalers = map[string]matchMarshaler{}
+
+// registerMatchMarshaler should be called by match extensions to register them
+// with the netfilter package.
+func registerMatchMarshaler(mm matchMarshaler) {
+	if _, ok := matchMarshalers[mm.name()]; ok {
+		panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name()))
+	}
+	matchMarshalers[mm.name()] = mm
+}
+
+func marshalMatcher(matcher iptables.Matcher) []byte {
+	matchMaker, ok := matchMarshalers[matcher.Name()]
+	if !ok {
+		panic(fmt.Errorf("Unknown matcher of type %T.", matcher))
+	}
+	return matchMaker.marshal(matcher)
+}
+
+// marshalEntryMatch creates a marshalled XTEntryMatch with the given name and
+// data appended at the end.
+func marshalEntryMatch(name string, data []byte) []byte {
+	nflog("marshaling matcher %q", name)
+
+	// We have to pad this struct size to a multiple of 8 bytes.
+	size := alignUp(linux.SizeOfXTEntryMatch+len(data), 8)
+	matcher := linux.KernelXTEntryMatch{
+		XTEntryMatch: linux.XTEntryMatch{
+			MatchSize: uint16(size),
+		},
+		Data: data,
+	}
+	copy(matcher.Name[:], name)
+
+	buf := make([]byte, 0, size)
+	buf = binary.Marshal(buf, usermem.ByteOrder, matcher)
+	return append(buf, make([]byte, size-len(buf))...)
+}
+
+func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
+	matchMaker, ok := matchMarshalers[match.Name.String()]
+	if !ok {
+		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
+	}
+	return matchMaker.unmarshal(buf, filter)
+}
+
+// alignUp rounds a length up to an alignment. align must be a power of 2.
+func alignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3dda6c7a1..8f14643b0 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,6 +17,7 @@
 package netfilter
 
 import (
+	"errors"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -34,10 +35,6 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
-const (
-	matcherNameUDP = "udp"
-)
-
 // Metadata is used to verify that we are correctly serializing and
 // deserializing iptables into structs consumable by the iptables tool. We save
 // a metadata struct when the tables are written, and when they are read out we
@@ -68,7 +65,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	// Find the appropriate table.
 	table, err := findTable(stack, info.Name)
 	if err != nil {
-		return linux.IPTGetinfo{}, err
+		nflog("%v", err)
+		return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
 	}
 
 	// Get the hooks that apply to this table.
@@ -95,39 +93,40 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
-		log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name)
+		nflog("couldn't copy in entries %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
 	}
 
 	// Find the appropriate table.
 	table, err := findTable(stack, userEntries.Name)
 	if err != nil {
-		log.Warningf("netfilter: couldn't find table %q", userEntries.Name)
-		return linux.KernelIPTGetEntries{}, err
+		nflog("%v", err)
+		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
 	entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
 	if err != nil {
-		return linux.KernelIPTGetEntries{}, err
+		nflog("couldn't read entries: %v", err)
+		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 	if meta != table.Metadata().(metadata) {
 		panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
 	}
 	if binary.Size(entries) > uintptr(outLen) {
-		log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
+		nflog("insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
 	return entries, nil
 }
 
-func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, *syserr.Error) {
+func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) {
 	ipt := stack.IPTables()
 	table, ok := ipt.Tables[tablename.String()]
 	if !ok {
-		return iptables.Table{}, syserr.ErrInvalidArgument
+		return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 	return table, nil
 }
@@ -151,19 +150,19 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
+// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) {
 	// Return values.
 	var entries linux.KernelIPTGetEntries
 	var meta metadata
 
 	// The table name has to fit in the struct.
 	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
-		log.Warningf("Table name %q too long.", tablename)
-		return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+		return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("Table name %q too long.", tablename)
 	}
 	copy(entries.Name[:], tablename)
 
@@ -229,46 +228,6 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-func marshalMatcher(matcher iptables.Matcher) []byte {
-	switch m := matcher.(type) {
-	case *iptables.UDPMatcher:
-		return marshalUDPMatcher(m)
-	default:
-		// TODO(gvisor.dev/issue/170): Support other matchers.
-		panic(fmt.Errorf("unknown matcher of type %T", matcher))
-	}
-}
-
-func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
-
-	// We have to pad this struct size to a multiple of 8 bytes.
-	size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8)
-
-	linuxMatcher := linux.KernelXTEntryMatch{
-		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: uint16(size),
-		},
-		Data: make([]byte, 0, linux.SizeOfXTUDP),
-	}
-	copy(linuxMatcher.Name[:], matcherNameUDP)
-
-	xtudp := linux.XTUDP{
-		SourcePortStart:      matcher.Data.SourcePortStart,
-		SourcePortEnd:        matcher.Data.SourcePortEnd,
-		DestinationPortStart: matcher.Data.DestinationPortStart,
-		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
-		InverseFlags:         matcher.Data.InverseFlags,
-	}
-	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
-
-	buf := make([]byte, 0, size)
-	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
-	buf = append(buf, make([]byte, size-len(buf))...)
-	nflog("convert to binary: marshalled UDP matcher into %v", buf)
-	return buf[:]
-}
-
 func marshalTarget(target iptables.Target) []byte {
 	switch target.(type) {
 	case iptables.UnconditionalAcceptTarget:
@@ -332,7 +291,7 @@ func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
 
 // translateToStandardVerdict translates from the value in a
 // linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
+func translateToStandardVerdict(val int32) (iptables.Verdict, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
@@ -340,13 +299,12 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 	case -linux.NF_DROP - 1:
 		return iptables.Drop, nil
 	case -linux.NF_QUEUE - 1:
-		log.Warningf("Unsupported iptables verdict QUEUE.")
+		return iptables.Invalid, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		log.Warningf("Unsupported iptables verdict RETURN.")
+		return iptables.Invalid, errors.New("unsupported iptables verdict RETURN")
 	default:
-		log.Warningf("Unknown iptables verdict %d.", val)
+		return iptables.Invalid, fmt.Errorf("unknown iptables verdict %d.", val)
 	}
-	return iptables.Invalid, syserr.ErrInvalidArgument
 }
 
 // SetEntries sets iptables rules for a single table. See
@@ -354,7 +312,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
-		log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
+		nflog("optVal has insufficient size for replace %d", len(optVal))
 		return syserr.ErrInvalidArgument
 	}
 	var replace linux.IPTReplace
@@ -368,7 +326,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	case iptables.TablenameFilter:
 		table = iptables.EmptyFilterTable()
 	default:
-		log.Warningf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
+		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
 	}
 
@@ -382,7 +340,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
-			log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
+			nflog("optVal has insufficient size for entry %d", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		var entry linux.IPTEntry
@@ -392,7 +350,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		optVal = optVal[linux.SizeOfIPTEntry:]
 
 		if entry.TargetOffset < linux.SizeOfIPTEntry {
-			log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset)
+			nflog("entry has too-small target offset %d", entry.TargetOffset)
 			return syserr.ErrInvalidArgument
 		}
 
@@ -400,7 +358,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// filtering fields.
 		filter, err := filterFromIPTIP(entry.IP)
 		if err != nil {
-			return err
+			nflog("bad iptip: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 
 		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
@@ -408,25 +367,26 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
-			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		matchers, err := parseMatchers(filter, optVal[:matchersSize])
 		if err != nil {
-			log.Warningf("netfilter: failed to parse matchers: %v", err)
-			return err
+			nflog("failed to parse matchers: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 		optVal = optVal[matchersSize:]
 
 		// Get the target of the rule.
 		targetSize := entry.NextOffset - entry.TargetOffset
 		if len(optVal) < int(targetSize) {
-			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
-			return err
+			nflog("failed to parse target: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 		optVal = optVal[targetSize:]
 
@@ -439,7 +399,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		offset += uint32(entry.NextOffset)
 
 		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
-			log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
 		}
 	}
 
@@ -457,11 +417,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 				}
 			}
 			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
-				log.Warningf("Hook %v is unset.", hk)
+				nflog("hook %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
-				log.Warningf("Underflow %v is unset.", hk)
+				nflog("underflow %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 		}
@@ -473,7 +433,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, ruleIdx := range table.BuiltinChains {
 		if hook != iptables.Input {
 			if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok {
-				log.Warningf("Hook %d is unsupported.", hook)
+				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
 		}
@@ -499,7 +459,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
-func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) {
 	nflog("set entries: parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
@@ -507,8 +467,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
-			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal))
 		}
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
@@ -517,45 +476,18 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
-			log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
-			return nil, syserr.ErrInvalidArgument
+
+			return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
 		}
 		if len(optVal) < int(match.MatchSize) {
-			log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal))
 		}
 
-		buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize]
-		var matcher iptables.Matcher
-		var err error
-		switch match.Name.String() {
-		case matcherNameUDP:
-			if len(buf) < linux.SizeOfXTUDP {
-				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
-				return nil, syserr.ErrInvalidArgument
-			}
-			// For alignment reasons, the match's total size may
-			// exceed what's strictly necessary to hold matchData.
-			var matchData linux.XTUDP
-			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
-			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
-			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{
-				SourcePortStart:      matchData.SourcePortStart,
-				SourcePortEnd:        matchData.SourcePortEnd,
-				DestinationPortStart: matchData.DestinationPortStart,
-				DestinationPortEnd:   matchData.DestinationPortEnd,
-				InverseFlags:         matchData.InverseFlags,
-			})
-			if err != nil {
-				log.Warningf("netfilter: failed to create UDP matcher: %v", err)
-				return nil, syserr.ErrInvalidArgument
-			}
-
-		default:
-			log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String())
-			return nil, syserr.ErrInvalidArgument
+		// Parse the specific matcher.
+		matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize])
+		if err != nil {
+			return nil, fmt.Errorf("failed to create matcher: %v", err)
 		}
-
 		matchers = append(matchers, matcher)
 
 		// TODO(gvisor.dev/issue/170): Check the revision field.
@@ -563,8 +495,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 	}
 
 	if len(optVal) != 0 {
-		log.Warningf("netfilter: optVal should be exhausted after parsing matchers")
-		return nil, syserr.ErrInvalidArgument
+		return nil, errors.New("optVal should be exhausted after parsing matchers")
 	}
 
 	return matchers, nil
@@ -572,11 +503,10 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
+func parseTarget(optVal []byte) (iptables.Target, error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
-		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
-		return nil, syserr.ErrInvalidArgument
+		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
 	}
 	var target linux.XTEntryTarget
 	buf := optVal[:linux.SizeOfXTEntryTarget]
@@ -585,8 +515,7 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 	case "":
 		// Standard target.
 		if len(optVal) != linux.SizeOfXTStandardTarget {
-			log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
 		}
 		var standardTarget linux.XTStandardTarget
 		buf = optVal[:linux.SizeOfXTStandardTarget]
@@ -602,15 +531,13 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case iptables.Drop:
 			return iptables.UnconditionalDropTarget{}, nil
 		default:
-			log.Warningf("Unknown verdict: %v", verdict)
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("Unknown verdict: %v", verdict)
 		}
 
 	case errorTargetName:
 		// Error target.
 		if len(optVal) != linux.SizeOfXTErrorTarget {
-			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
 		}
 		var errorTarget linux.XTErrorTarget
 		buf = optVal[:linux.SizeOfXTErrorTarget]
@@ -627,20 +554,17 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case errorTargetName:
 			return iptables.ErrorTarget{}, nil
 		default:
-			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
 		}
 	}
 
 	// Unknown target.
-	log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-	return nil, syserr.ErrInvalidArgument
+	return nil, fmt.Errorf("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) {
+func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) {
 	if containsUnsupportedFields(iptip) {
-		log.Warningf("netfilter: unsupported fields in struct iptip: %+v", iptip)
-		return iptables.IPHeaderFilter{}, syserr.ErrInvalidArgument
+		return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
 	}
 	return iptables.IPHeaderFilter{
 		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
@@ -678,8 +602,3 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
-
-// alignUp rounds a length up to an alignment. align must be a power of 2.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
new file mode 100644
index 000000000..1646d22f7
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -0,0 +1,143 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameTCP = "tcp"
+
+func init() {
+	registerMatchMarshaler(tcpMarshaler{})
+}
+
+// tcpMarshaler implements matchMarshaler for TCP matching.
+type tcpMarshaler struct{}
+
+// name implements matchMarshaler.name.
+func (tcpMarshaler) name() string {
+	return matcherNameTCP
+}
+
+// marshal implements matchMarshaler.marshal.
+func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
+	matcher := mr.(*TCPMatcher)
+	xttcp := linux.XTTCP{
+		SourcePortStart:      matcher.sourcePortStart,
+		SourcePortEnd:        matcher.sourcePortEnd,
+		DestinationPortStart: matcher.destinationPortStart,
+		DestinationPortEnd:   matcher.destinationPortEnd,
+	}
+	buf := make([]byte, 0, linux.SizeOfXTUDP)
+	return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp))
+}
+
+// unmarshal implements matchMarshaler.unmarshal.
+func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+	if len(buf) < linux.SizeOfXTTCP {
+		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may
+	// exceed what's strictly necessary to hold matchData.
+	var matchData linux.XTTCP
+	binary.Unmarshal(buf[:linux.SizeOfXTTCP], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed XTTCP: %+v", matchData)
+
+	if matchData.Option != 0 ||
+		matchData.FlagMask != 0 ||
+		matchData.FlagCompare != 0 ||
+		matchData.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported TCP matcher flags set")
+	}
+
+	if filter.Protocol != header.TCPProtocolNumber {
+		return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
+	}
+
+	return &TCPMatcher{
+		sourcePortStart:      matchData.SourcePortStart,
+		sourcePortEnd:        matchData.SourcePortEnd,
+		destinationPortStart: matchData.DestinationPortStart,
+		destinationPortEnd:   matchData.DestinationPortEnd,
+	}, nil
+}
+
+// TCPMatcher matches TCP packets and their headers. It implements Matcher.
+type TCPMatcher struct {
+	sourcePortStart      uint16
+	sourcePortEnd        uint16
+	destinationPortStart uint16
+	destinationPortEnd   uint16
+}
+
+// Name implements Matcher.Name.
+func (*TCPMatcher) Name() string {
+	return matcherNameTCP
+}
+
+// Match implements Matcher.Match.
+func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
+	var tcpHeader header.TCP
+	if pkt.TransportHeader != nil {
+		tcpHeader = header.TCP(pkt.TransportHeader)
+	} else {
+		// The TCP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.TCPMinimumSize {
+			// There's no valid TCP header here, so we hotdrop the
+			// packet.
+			return false, true
+		}
+		tcpHeader = header.TCP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	if sourcePort := tcpHeader.SourcePort(); sourcePort < tm.sourcePortStart || tm.sourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort := tcpHeader.DestinationPort(); destinationPort < tm.destinationPortStart || tm.destinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
new file mode 100644
index 000000000..b6e95bbc5
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameUDP = "udp"
+
+func init() {
+	registerMatchMarshaler(udpMarshaler{})
+}
+
+// udpMarshaler implements matchMarshaler for UDP matching.
+type udpMarshaler struct{}
+
+// name implements matchMarshaler.name.
+func (udpMarshaler) name() string {
+	return matcherNameUDP
+}
+
+// marshal implements matchMarshaler.marshal.
+func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
+	matcher := mr.(*UDPMatcher)
+	xtudp := linux.XTUDP{
+		SourcePortStart:      matcher.sourcePortStart,
+		SourcePortEnd:        matcher.sourcePortEnd,
+		DestinationPortStart: matcher.destinationPortStart,
+		DestinationPortEnd:   matcher.destinationPortEnd,
+	}
+	buf := make([]byte, 0, linux.SizeOfXTUDP)
+	return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp))
+}
+
+// unmarshal implements matchMarshaler.unmarshal.
+func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+	if len(buf) < linux.SizeOfXTUDP {
+		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may exceed what's
+	// strictly necessary to hold matchData.
+	var matchData linux.XTUDP
+	binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed XTUDP: %+v", matchData)
+
+	if matchData.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
+	}
+
+	if filter.Protocol != header.UDPProtocolNumber {
+		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+	}
+
+	return &UDPMatcher{
+		sourcePortStart:      matchData.SourcePortStart,
+		sourcePortEnd:        matchData.SourcePortEnd,
+		destinationPortStart: matchData.DestinationPortStart,
+		destinationPortEnd:   matchData.DestinationPortEnd,
+	}, nil
+}
+
+// UDPMatcher matches UDP packets and their headers. It implements Matcher.
+type UDPMatcher struct {
+	sourcePortStart      uint16
+	sourcePortEnd        uint16
+	destinationPortStart uint16
+	destinationPortEnd   uint16
+}
+
+// Name implements Matcher.Name.
+func (*UDPMatcher) Name() string {
+	return matcherNameUDP
+}
+
+// Match implements Matcher.Match.
+func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+	// into the iptables.Check codepath as matchers are added.
+	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+		return false, false
+	}
+
+	// We dont't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
+	var udpHeader header.UDP
+	if pkt.TransportHeader != nil {
+		udpHeader = header.UDP(pkt.TransportHeader)
+	} else {
+		// The UDP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.UDPMinimumSize {
+			// There's no valid UDP header here, so we hotdrop the
+			// packet.
+			return false, true
+		}
+		udpHeader = header.UDP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	if sourcePort := udpHeader.SourcePort(); sourcePort < um.sourcePortStart || um.sourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort := udpHeader.DestinationPort(); destinationPort < um.destinationPortStart || um.destinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index bab26580b..d1b73cfdf 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -8,7 +8,6 @@ go_library(
         "iptables.go",
         "targets.go",
         "types.go",
-        "udp_matcher.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 7f77802a0..d660aab04 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -171,6 +171,9 @@ type IPHeaderFilter struct {
 
 // A Matcher is the interface for matching packets.
 type Matcher interface {
+	// Name returns the name of the Matcher.
+	Name() string
+
 	// Match returns whether the packet matches and whether the packet
 	// should be "hotdropped", i.e. dropped immediately. This is usually
 	// used for suspicious packets.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
deleted file mode 100644
index 3bb076f9c..000000000
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-// TODO(gvisor.dev/issue/170): The following per-matcher params should be
-// supported:
-// - Table name
-// - Match size
-// - User size
-// - Hooks
-// - Proto
-// - Family
-
-// UDPMatcher matches UDP packets and their headers. It implements Matcher.
-type UDPMatcher struct {
-	Data UDPMatcherParams
-}
-
-// UDPMatcherParams are the parameters used to create a UDPMatcher.
-type UDPMatcherParams struct {
-	SourcePortStart      uint16
-	SourcePortEnd        uint16
-	DestinationPortStart uint16
-	DestinationPortEnd   uint16
-	InverseFlags         uint8
-}
-
-// NewUDPMatcher returns a new instance of UDPMatcher.
-func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) {
-	log.Infof("Adding rule with UDPMatcherParams: %+v", data)
-
-	if data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
-	}
-
-	if filter.Protocol != header.UDPProtocolNumber {
-		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
-	}
-
-	return &UDPMatcher{Data: data}, nil
-}
-
-// Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader)
-
-	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
-	// into the iptables.Check codepath as matchers are added.
-	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
-		return false, false
-	}
-
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			log.Warningf("Dropping UDP packet: malicious fragmented packet.")
-			return false, true
-		}
-		return false, false
-	}
-
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
-	// added.
-	var udpHeader header.UDP
-	if pkt.TransportHeader != nil {
-		udpHeader = header.UDP(pkt.TransportHeader)
-	} else {
-		// The UDP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.UDPMinimumSize {
-			// There's no valid UDP header here, so we hotdrop the
-			// packet.
-			log.Warningf("Dropping UDP packet: size too small.")
-			return false, true
-		}
-		udpHeader = header.UDP(pkt.Data.First())
-	}
-
-	// Check whether the source and destination ports are within the
-	// matching range.
-	sourcePort := udpHeader.SourcePort()
-	destinationPort := udpHeader.DestinationPort()
-	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
-		return false, false
-	}
-	if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort {
-		return false, false
-	}
-
-	return true, false
-}
-- 
cgit v1.2.3


From 29ad5762e4549d961f48c65292cfdeb7256524f6 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 31 Jan 2020 13:53:58 -0800
Subject: Spelling

---
 pkg/sentry/socket/netfilter/extensions.go  | 18 +++++++++---------
 pkg/sentry/socket/netfilter/tcp_matcher.go | 10 +++++-----
 pkg/sentry/socket/netfilter/udp_matcher.go | 10 +++++-----
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 5a4cac84c..b5fbb52e4 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -32,8 +32,8 @@ import (
 // - Proto
 // - Family
 
-// matchMarshaler knows how to (un)marshal the matcher named name().
-type matchMarshaler interface {
+// matchMaker knows how to (un)marshal the matcher named name().
+type matchMaker interface {
 	// name is the matcher name as stored in the xt_entry_match struct.
 	name() string
 
@@ -45,19 +45,19 @@ type matchMarshaler interface {
 	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
 }
 
-var matchMarshalers = map[string]matchMarshaler{}
+var matchMakers = map[string]matchMaker{}
 
-// registerMatchMarshaler should be called by match extensions to register them
+// registermatchMaker should be called by match extensions to register them
 // with the netfilter package.
-func registerMatchMarshaler(mm matchMarshaler) {
-	if _, ok := matchMarshalers[mm.name()]; ok {
+func registerMatchMaker(mm matchMaker) {
+	if _, ok := matchMakers[mm.name()]; ok {
 		panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name()))
 	}
-	matchMarshalers[mm.name()] = mm
+	matchMakers[mm.name()] = mm
 }
 
 func marshalMatcher(matcher iptables.Matcher) []byte {
-	matchMaker, ok := matchMarshalers[matcher.Name()]
+	matchMaker, ok := matchMakers[matcher.Name()]
 	if !ok {
 		panic(fmt.Errorf("Unknown matcher of type %T.", matcher))
 	}
@@ -85,7 +85,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 }
 
 func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
-	matchMaker, ok := matchMarshalers[match.Name.String()]
+	matchMaker, ok := matchMakers[match.Name.String()]
 	if !ok {
 		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
 	}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 1646d22f7..6b2f4c31a 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -28,18 +28,18 @@ import (
 const matcherNameTCP = "tcp"
 
 func init() {
-	registerMatchMarshaler(tcpMarshaler{})
+	registerMatchMaker(tcpMarshaler{})
 }
 
-// tcpMarshaler implements matchMarshaler for TCP matching.
+// tcpMarshaler implements matchMaker for TCP matching.
 type tcpMarshaler struct{}
 
-// name implements matchMarshaler.name.
+// name implements matchMaker.name.
 func (tcpMarshaler) name() string {
 	return matcherNameTCP
 }
 
-// marshal implements matchMarshaler.marshal.
+// marshal implements matchMaker.marshal.
 func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 	matcher := mr.(*TCPMatcher)
 	xttcp := linux.XTTCP{
@@ -52,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 	return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp))
 }
 
-// unmarshal implements matchMarshaler.unmarshal.
+// unmarshal implements matchMaker.unmarshal.
 func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
 	if len(buf) < linux.SizeOfXTTCP {
 		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index b6e95bbc5..86aa11696 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -28,18 +28,18 @@ import (
 const matcherNameUDP = "udp"
 
 func init() {
-	registerMatchMarshaler(udpMarshaler{})
+	registerMatchMaker(udpMarshaler{})
 }
 
-// udpMarshaler implements matchMarshaler for UDP matching.
+// udpMarshaler implements matchMaker for UDP matching.
 type udpMarshaler struct{}
 
-// name implements matchMarshaler.name.
+// name implements matchMaker.name.
 func (udpMarshaler) name() string {
 	return matcherNameUDP
 }
 
-// marshal implements matchMarshaler.marshal.
+// marshal implements matchMaker.marshal.
 func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 	matcher := mr.(*UDPMatcher)
 	xtudp := linux.XTUDP{
@@ -52,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 	return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp))
 }
 
-// unmarshal implements matchMarshaler.unmarshal.
+// unmarshal implements matchMaker.unmarshal.
 func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
 	if len(buf) < linux.SizeOfXTUDP {
 		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
-- 
cgit v1.2.3


From 77bf586db75b3dbd9dcb14c349bde8372d26425c Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 31 Jan 2020 13:54:57 -0800
Subject: Use multicast Ethernet address for multicast NDP

As per RFC 2464 section 7, an IPv6 packet with a multicast destination
address is transmitted to the mapped Ethernet multicast address.

Test:
- ipv6.TestLinkResolution
- stack_test.TestDADResolve
- stack_test.TestRouterSolicitation
PiperOrigin-RevId: 292610529
---
 pkg/tcpip/header/ipv6_test.go                      | 29 ++++++++++++++++++++++
 pkg/tcpip/link/channel/channel.go                  | 29 ++++++++++++++--------
 pkg/tcpip/network/ipv6/icmp.go                     |  6 ++++-
 pkg/tcpip/network/ipv6/icmp_test.go                | 12 ++++++---
 pkg/tcpip/stack/ndp.go                             | 17 +++++++++++++
 pkg/tcpip/stack/ndp_test.go                        | 16 +++++++++++-
 pkg/tcpip/stack/route.go                           |  4 ++-
 pkg/tcpip/transport/tcp/testing/context/context.go |  6 ++++-
 8 files changed, 101 insertions(+), 18 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index 29f54bc57..c3ad503aa 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -17,6 +17,7 @@ package header_test
 import (
 	"bytes"
 	"crypto/sha256"
+	"fmt"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -300,3 +301,31 @@ func TestScopeForIPv6Address(t *testing.T) {
 		})
 	}
 }
+
+func TestSolicitedNodeAddr(t *testing.T) {
+	tests := []struct {
+		addr tcpip.Address
+		want tcpip.Address
+	}{
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\xa0",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0",
+		},
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x0e\x0f\xa0",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0",
+		},
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x01\x02\x03",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x01\x02\x03",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(fmt.Sprintf("%s", test.addr), func(t *testing.T) {
+			if got := header.SolicitedNodeAddr(test.addr); got != test.want {
+				t.Fatalf("got header.SolicitedNodeAddr(%s) = %s, want = %s", test.addr, got, test.want)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 71b9da797..78d447acd 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -30,15 +30,16 @@ type PacketInfo struct {
 	Pkt   tcpip.PacketBuffer
 	Proto tcpip.NetworkProtocolNumber
 	GSO   *stack.GSO
+	Route stack.Route
 }
 
 // Endpoint is link layer endpoint that stores outbound packets in a channel
 // and allows injection of inbound packets.
 type Endpoint struct {
-	dispatcher stack.NetworkDispatcher
-	mtu        uint32
-	linkAddr   tcpip.LinkAddress
-	GSO        bool
+	dispatcher         stack.NetworkDispatcher
+	mtu                uint32
+	linkAddr           tcpip.LinkAddress
+	LinkEPCapabilities stack.LinkEndpointCapabilities
 
 	// c is where outbound packets are queued.
 	c chan PacketInfo
@@ -122,11 +123,7 @@ func (e *Endpoint) MTU() uint32 {
 
 // Capabilities implements stack.LinkEndpoint.Capabilities.
 func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
-	caps := stack.LinkEndpointCapabilities(0)
-	if e.GSO {
-		caps |= stack.CapabilityHardwareGSO
-	}
-	return caps
+	return e.LinkEPCapabilities
 }
 
 // GSOMaxSize returns the maximum GSO packet size.
@@ -146,11 +143,16 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+	// Clone r then release its resource so we only get the relevant fields from
+	// stack.Route without holding a reference to a NIC's endpoint.
+	route := r.Clone()
+	route.Release()
 	p := PacketInfo{
 		Pkt:   pkt,
 		Proto: protocol,
 		GSO:   gso,
+		Route: route,
 	}
 
 	select {
@@ -162,7 +164,11 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// Clone r then release its resource so we only get the relevant fields from
+	// stack.Route without holding a reference to a NIC's endpoint.
+	route := r.Clone()
+	route.Release()
 	payloadView := pkts[0].Data.ToView()
 	n := 0
 packetLoop:
@@ -176,6 +182,7 @@ packetLoop:
 			},
 			Proto: protocol,
 			GSO:   gso,
+			Route: route,
 		}
 
 		select {
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 7491cfc41..60817d36d 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -408,10 +408,14 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 // LinkAddressRequest implements stack.LinkAddressResolver.
 func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error {
 	snaddr := header.SolicitedNodeAddr(addr)
+
+	// TODO(b/148672031): Use stack.FindRoute instead of manually creating the
+	// route here. Note, we would need the nicID to do this properly so the right
+	// NIC (associated to linkEP) is used to send the NDP NS message.
 	r := &stack.Route{
 		LocalAddress:      localAddr,
 		RemoteAddress:     snaddr,
-		RemoteLinkAddress: broadcastMAC,
+		RemoteLinkAddress: header.EthernetAddressFromMulticastIPv6Address(snaddr),
 	}
 	hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
 	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 7a6820643..d0e930e20 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -270,8 +270,9 @@ func (c *testContext) cleanup() {
 }
 
 type routeArgs struct {
-	src, dst *channel.Endpoint
-	typ      header.ICMPv6Type
+	src, dst       *channel.Endpoint
+	typ            header.ICMPv6Type
+	remoteLinkAddr tcpip.LinkAddress
 }
 
 func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.ICMPv6)) {
@@ -292,6 +293,11 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		t.Errorf("unexpected protocol number %d", pi.Proto)
 		return
 	}
+
+	if len(args.remoteLinkAddr) != 0 && args.remoteLinkAddr != pi.Route.RemoteLinkAddress {
+		t.Errorf("got remote link address = %s, want = %s", pi.Route.RemoteLinkAddress, args.remoteLinkAddr)
+	}
+
 	ipv6 := header.IPv6(pi.Pkt.Header.View())
 	transProto := tcpip.TransportProtocolNumber(ipv6.NextHeader())
 	if transProto != header.ICMPv6ProtocolNumber {
@@ -339,7 +345,7 @@ func TestLinkResolution(t *testing.T) {
 				t.Fatalf("ep.Write(_) = _, <non-nil>, %s, want = _, <non-nil>, tcpip.ErrNoLinkAddress", err)
 			}
 			for _, args := range []routeArgs{
-				{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit},
+				{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))},
 				{src: c.linkEP1, dst: c.linkEP0, typ: header.ICMPv6NeighborAdvert},
 			} {
 				routeICMPv6Packet(t, args, func(t *testing.T, icmpv6 header.ICMPv6) {
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 31294345d..6123fda33 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -538,6 +538,14 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
+	// Route should resolve immediately since snmc is a multicast address so a
+	// remote link address can be calculated without a resolution process.
+	if c, err := r.Resolve(nil); err != nil {
+		log.Fatalf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err)
+	} else if c != nil {
+		log.Fatalf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID())
+	}
+
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
 	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
 	pkt.SetType(header.ICMPv6NeighborSolicit)
@@ -1197,6 +1205,15 @@ func (ndp *ndpState) startSolicitingRouters() {
 		r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
 
+		// Route should resolve immediately since
+		// header.IPv6AllRoutersMulticastAddress is a multicast address so a
+		// remote link address can be calculated without a resolution process.
+		if c, err := r.Resolve(nil); err != nil {
+			log.Fatalf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err)
+		} else if c != nil {
+			log.Fatalf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID())
+		}
+
 		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
 		hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize)
 		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index bc7cfbcb4..8af8565f7 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -336,6 +336,7 @@ func TestDADResolve(t *testing.T) {
 			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
 
 			e := channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 			s := stack.New(opts)
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -413,6 +414,12 @@ func TestDADResolve(t *testing.T) {
 					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
 				}
 
+				// Make sure the right remote link address is used.
+				snmc := header.SolicitedNodeAddr(addr1)
+				if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want {
+					t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
+				}
+
 				// Check NDP NS packet.
 				//
 				// As per RFC 4861 section 4.3, a possible option is the Source Link
@@ -420,7 +427,7 @@ func TestDADResolve(t *testing.T) {
 				// address of the packet is the unspecified address.
 				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
 					checker.SrcAddr(header.IPv6Any),
-					checker.DstAddr(header.SolicitedNodeAddr(addr1)),
+					checker.DstAddr(snmc),
 					checker.TTL(header.NDPHopLimit),
 					checker.NDPNS(
 						checker.NDPNSTargetAddress(addr1),
@@ -3292,6 +3299,7 @@ func TestRouterSolicitation(t *testing.T) {
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
 				e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
+				e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 				waitForPkt := func(timeout time.Duration) {
 					t.Helper()
 					ctx, _ := context.WithTimeout(context.Background(), timeout)
@@ -3304,6 +3312,12 @@ func TestRouterSolicitation(t *testing.T) {
 					if p.Proto != header.IPv6ProtocolNumber {
 						t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
 					}
+
+					// Make sure the right remote link address is used.
+					if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); p.Route.RemoteLinkAddress != want {
+						t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
+					}
+
 					checker.IPv6(t,
 						p.Pkt.Header.View(),
 						checker.SrcAddr(header.IPv6Any),
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 517f4b941..f565aafb2 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -225,7 +225,9 @@ func (r *Route) Release() {
 // Clone Clone a route such that the original one can be released and the new
 // one will remain valid.
 func (r *Route) Clone() Route {
-	r.ref.incRef()
+	if r.ref != nil {
+		r.ref.incRef()
+	}
 	return *r
 }
 
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 730ac4292..1e9a0dea3 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -1082,7 +1082,11 @@ func (c *Context) SACKEnabled() bool {
 
 // SetGSOEnabled enables or disables generic segmentation offload.
 func (c *Context) SetGSOEnabled(enable bool) {
-	c.linkEP.GSO = enable
+	if enable {
+		c.linkEP.LinkEPCapabilities |= stack.CapabilityHardwareGSO
+	} else {
+		c.linkEP.LinkEPCapabilities &^= stack.CapabilityHardwareGSO
+	}
 }
 
 // MSSWithoutOptions returns the value for the MSS used by the stack when no
-- 
cgit v1.2.3


From 6c3072243dfbf70062de5f610e14fd6ed2ce5f32 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 31 Jan 2020 14:14:52 -0800
Subject: Implement file locks for regular tmpfs files in VFSv2.

Add a file lock implementation that can be embedded into various filesystem
implementations.

Updates #1480

PiperOrigin-RevId: 292614758
---
 pkg/sentry/fsimpl/tmpfs/BUILD                |  3 ++
 pkg/sentry/fsimpl/tmpfs/regular_file.go      | 23 +++++++++
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 56 ++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             | 43 +++++++++++++++++
 pkg/sentry/vfs/lock/BUILD                    | 13 +++++
 pkg/sentry/vfs/lock/lock.go                  | 72 ++++++++++++++++++++++++++++
 6 files changed, 210 insertions(+)
 create mode 100644 pkg/sentry/vfs/lock/BUILD
 create mode 100644 pkg/sentry/vfs/lock/lock.go

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index c61366224..57abd5583 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -38,6 +38,7 @@ go_library(
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
@@ -47,6 +48,7 @@ go_library(
         "//pkg/sentry/platform",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
@@ -86,6 +88,7 @@ go_test(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/contexttest",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/contexttest",
         "//pkg/sentry/vfs",
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index e9e6faf67..dab346a41 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -192,6 +193,28 @@ func (fd *regularFileFD) Sync(ctx context.Context) error {
 	return nil
 }
 
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
+	return fd.inode().lockBSD(uid, t, block)
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
+	fd.inode().unlockBSD(uid)
+	return nil
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
+	return fd.inode().lockPOSIX(uid, t, rng, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
+	fd.inode().unlockPOSIX(uid, rng)
+	return nil
+}
+
 // regularFileReadWriter implements safemem.Reader and Safemem.Writer.
 type regularFileReadWriter struct {
 	file *regularFile
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 32552e261..2b52992ea 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -24,9 +24,11 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -260,6 +262,60 @@ func TestPWrite(t *testing.T) {
 	}
 }
 
+func TestLocks(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	var (
+		uid1 lock.UniqueID
+		uid2 lock.UniqueID
+		// Non-blocking.
+		block lock.Blocker
+	)
+
+	uid1 = 123
+	uid2 = 456
+
+	if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+	if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want {
+		t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
+	}
+	if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
+		t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err)
+	}
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+
+	rng1 := lock.LockRange{0, 1}
+	rng2 := lock.LockRange{1, 2}
+
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want {
+		t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
+	}
+	if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil {
+		t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err)
+	}
+}
+
 func TestPRead(t *testing.T) {
 	ctx := contexttest.Context(t)
 	fd, cleanup, err := newFileFD(ctx, 0644)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 88dbd6e35..2108d0f4d 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -30,10 +30,12 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -153,6 +155,9 @@ type inode struct {
 	rdevMajor uint32
 	rdevMinor uint32
 
+	// Advisory file locks, which lock at the inode level.
+	locks lock.FileLocks
+
 	impl interface{} // immutable
 }
 
@@ -352,6 +357,44 @@ func (i *inode) setStat(stat linux.Statx) error {
 	return nil
 }
 
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		return i.locks.LockBSD(uid, t, block)
+	}
+	return syserror.EBADF
+}
+
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) unlockBSD(uid fslock.UniqueID) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		i.locks.UnlockBSD(uid)
+		return nil
+	}
+	return syserror.EBADF
+}
+
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		return i.locks.LockPOSIX(uid, t, rng, block)
+	}
+	return syserror.EBADF
+}
+
+// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
+func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error {
+	switch i.impl.(type) {
+	case *regularFile:
+		i.locks.UnlockPOSIX(uid, rng)
+		return nil
+	}
+	return syserror.EBADF
+}
+
 // allocatedBlocksForSize returns the number of 512B blocks needed to
 // accommodate the given size in bytes, as appropriate for struct
 // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD
new file mode 100644
index 000000000..d9ab063b7
--- /dev/null
+++ b/pkg/sentry/vfs/lock/BUILD
@@ -0,0 +1,13 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "lock",
+    srcs = ["lock.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/sentry/fs/lock",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go
new file mode 100644
index 000000000..724dfe743
--- /dev/null
+++ b/pkg/sentry/vfs/lock/lock.go
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lock provides POSIX and BSD style file locking for VFS2 file
+// implementations.
+//
+// The actual implementations can be found in the lock package under
+// sentry/fs/lock.
+package lock
+
+import (
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2)
+// and flock(2) respectively in Linux. It can be embedded into various file
+// implementations for VFS2 that support locking.
+//
+// Note that in Linux these two types of locks are _not_ cooperative, because
+// race and deadlock conditions make merging them prohibitive. We do the same
+// and keep them oblivious to each other.
+type FileLocks struct {
+	// bsd is a set of BSD-style advisory file wide locks, see flock(2).
+	bsd fslock.Locks
+
+	// posix is a set of POSIX-style regional advisory locks, see fcntl(2).
+	posix fslock.Locks
+}
+
+// LockBSD tries to acquire a BSD-style lock on the entire file.
+func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) {
+		return nil
+	}
+	return syserror.ErrWouldBlock
+}
+
+// UnlockBSD releases a BSD-style lock on the entire file.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
+	fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF})
+}
+
+// LockPOSIX tries to acquire a POSIX-style lock on a file region.
+func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+	if fl.posix.LockRegion(uid, t, rng, block) {
+		return nil
+	}
+	return syserror.ErrWouldBlock
+}
+
+// UnlockPOSIX releases a POSIX-style lock on a file region.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) {
+	fl.posix.UnlockRegion(uid, rng)
+}
-- 
cgit v1.2.3


From 02997af5abd62d778fca4d01b047a6bdebab2090 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Fri, 31 Jan 2020 15:08:17 -0800
Subject: Fix method comment to match method name.

PiperOrigin-RevId: 292624867
---
 pkg/tcpip/transport/tcp/accept.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 6101f2945..08afb7c17 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -271,8 +271,8 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	return n, nil
 }
 
-// createEndpoint creates a new endpoint in connected state and then performs
-// the TCP 3-way handshake.
+// createEndpointAndPerformHandshake creates a new endpoint in connected state
+// and then performs the TCP 3-way handshake.
 func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
-- 
cgit v1.2.3


From f37e913a358820ea98013772dd2880cc8a3c9218 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 3 Feb 2020 16:15:16 -0800
Subject: seccomp: allow to filter syscalls by instruction pointer

PiperOrigin-RevId: 293029446
---
 pkg/seccomp/seccomp.go       | 20 ++++++++++++++++----
 pkg/seccomp/seccomp_rules.go |  6 +++++-
 pkg/seccomp/seccomp_test.go  | 27 +++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 5 deletions(-)

(limited to 'pkg')

diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index fc36efa23..55fd6967e 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -219,24 +219,36 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAc
 				switch a := arg.(type) {
 				case AllowAny:
 				case AllowValue:
+					dataOffsetLow := seccompDataOffsetArgLow(i)
+					dataOffsetHigh := seccompDataOffsetArgHigh(i)
+					if i == RuleIP {
+						dataOffsetLow = seccompDataOffsetIPLow
+						dataOffsetHigh = seccompDataOffsetIPHigh
+					}
 					high, low := uint32(a>>32), uint32(a)
 					// assert arg_low == low
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					// assert arg_high == high
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					labelled = true
 				case GreaterThan:
+					dataOffsetLow := seccompDataOffsetArgLow(i)
+					dataOffsetHigh := seccompDataOffsetArgHigh(i)
+					if i == RuleIP {
+						dataOffsetLow = seccompDataOffsetIPLow
+						dataOffsetHigh = seccompDataOffsetIPHigh
+					}
 					labelGood := fmt.Sprintf("gt%v", i)
 					high, low := uint32(a>>32), uint32(a)
 					// assert arg_high < high
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					// arg_high > high
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
 					// arg_low < low
-					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
 					labelled = true
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index 84c841d7f..06308cd29 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -62,7 +62,11 @@ func (a AllowValue) String() (s string) {
 // rule := Rule {
 //       AllowValue(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0
 // }
-type Rule [6]interface{}
+type Rule [7]interface{} // 6 arguments + RIP
+
+// RuleIP indicates what rules in the Rule array have to be applied to
+// instruction pointer.
+const RuleIP = 6
 
 func (r Rule) String() (s string) {
 	if len(r) == 0 {
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index abbee7051..da5a5e4b2 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -388,6 +388,33 @@ func TestBasic(t *testing.T) {
 				},
 			},
 		},
+		{
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								RuleIP: AllowValue(0x7aabbccdd),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			specs: []spec{
+				{
+					desc: "IP: Syscall instruction pointer allowed",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{}, instructionPointer: 0x7aabbccdd},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "IP: Syscall instruction pointer disallowed",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{}, instructionPointer: 0x711223344},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
 	} {
 		instrs, err := BuildProgram(test.ruleSets, test.defaultAction)
 		if err != nil {
-- 
cgit v1.2.3


From d7cd484091543827678f1548b8e5668a7a86e13f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 4 Feb 2020 08:20:10 -0800
Subject: Add support for sentry internal pipe for gofer mounts

Internal pipes are supported similarly to how internal UDS is done.
It is also controlled by the same flag.

Fixes #1102

PiperOrigin-RevId: 293150045
---
 pkg/sentry/fs/gofer/BUILD            |   2 +
 pkg/sentry/fs/gofer/cache_policy.go  |   3 +
 pkg/sentry/fs/gofer/fifo.go          |  40 ++++++++
 pkg/sentry/fs/gofer/gofer_test.go    |   2 +-
 pkg/sentry/fs/gofer/path.go          | 183 +++++++++++++++++++++++------------
 pkg/sentry/fs/gofer/session.go       | 183 ++++++++++++++++++++++-------------
 pkg/sentry/fs/gofer/session_state.go |  12 +--
 pkg/sentry/fs/gofer/socket.go        |   8 +-
 test/syscalls/BUILD                  |   1 -
 9 files changed, 293 insertions(+), 141 deletions(-)
 create mode 100644 pkg/sentry/fs/gofer/fifo.go

(limited to 'pkg')

diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index 971d3718e..fea135eea 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -9,6 +9,7 @@ go_library(
         "cache_policy.go",
         "context_file.go",
         "device.go",
+        "fifo.go",
         "file.go",
         "file_state.go",
         "fs.go",
@@ -38,6 +39,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/host",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go
index ebea03c42..07a564e92 100644
--- a/pkg/sentry/fs/gofer/cache_policy.go
+++ b/pkg/sentry/fs/gofer/cache_policy.go
@@ -127,6 +127,9 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child
 
 	childIops, ok := child.InodeOperations.(*inodeOperations)
 	if !ok {
+		if _, ok := child.InodeOperations.(*fifo); ok {
+			return false
+		}
 		panic(fmt.Sprintf("revalidating inode operations of unknown type %T", child.InodeOperations))
 	}
 	parentIops, ok := parent.InodeOperations.(*inodeOperations)
diff --git a/pkg/sentry/fs/gofer/fifo.go b/pkg/sentry/fs/gofer/fifo.go
new file mode 100644
index 000000000..456557058
--- /dev/null
+++ b/pkg/sentry/fs/gofer/fifo.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+// +stateify savable
+type fifo struct {
+	fs.InodeOperations
+	fileIops *inodeOperations
+}
+
+var _ fs.InodeOperations = (*fifo)(nil)
+
+// Rename implements fs.InodeOperations. It forwards the call to the underlying
+// file inode to handle the file rename. Note that file key remains the same
+// after the rename to keep the endpoint mapping.
+func (i *fifo) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
+	return i.fileIops.Rename(ctx, inode, oldParent, oldName, newParent, newName, replacement)
+}
+
+// StatFS implements fs.InodeOperations.
+func (i *fifo) StatFS(ctx context.Context) (fs.Info, error) {
+	return i.fileIops.StatFS(ctx)
+}
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go
index 0c2f89ae8..2df2fe889 100644
--- a/pkg/sentry/fs/gofer/gofer_test.go
+++ b/pkg/sentry/fs/gofer/gofer_test.go
@@ -61,7 +61,7 @@ func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context
 		ctx := contexttest.Context(t)
 		sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{
 			file: rootFile,
-		}, root.QID, p9.AttrMaskAll(), root.Attr, false /* socket */)
+		}, root.QID, p9.AttrMaskAll(), root.Attr)
 		m := fs.NewMountSource(ctx, s, &filesystem{}, fs.MountSourceFlags{})
 		rootInode := fs.NewInode(ctx, rootInodeOperations, m, sattr)
 
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index 0c1be05ef..a35c3a23d 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -23,14 +23,24 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
 // encoding of strings, which uses 2 bytes for the length prefix.
 const maxFilenameLen = (1 << 16) - 1
 
+func changeType(mode p9.FileMode, newType p9.FileMode) p9.FileMode {
+	if newType&^p9.FileModeMask != 0 {
+		panic(fmt.Sprintf("newType contained more bits than just file mode: %x", newType))
+	}
+	clear := mode &^ p9.FileModeMask
+	return clear | newType
+}
+
 // Lookup loads an Inode at name into a Dirent based on the session's cache
 // policy.
 func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
@@ -69,8 +79,25 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string
 		return nil, err
 	}
 
+	if i.session().overrides != nil {
+		// Check if file belongs to a internal named pipe. Note that it doesn't need
+		// to check for sockets because it's done in newInodeOperations below.
+		deviceKey := device.MultiDeviceKey{
+			Device:          p9attr.RDev,
+			SecondaryDevice: i.session().connID,
+			Inode:           qids[0].Path,
+		}
+		unlock := i.session().overrides.lock()
+		if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil {
+			unlock()
+			pipeInode.IncRef()
+			return fs.NewDirent(ctx, pipeInode, name), nil
+		}
+		unlock()
+	}
+
 	// Construct the Inode operations.
-	sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr, false)
+	sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr)
 
 	// Construct a positive Dirent.
 	return fs.NewDirent(ctx, fs.NewInode(ctx, node, dir.MountSource, sattr), name), nil
@@ -138,7 +165,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string
 	qid := qids[0]
 
 	// Construct the InodeOperations.
-	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false)
+	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr)
 
 	// Construct the positive Dirent.
 	d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
@@ -223,82 +250,115 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 		return nil, syserror.ENAMETOOLONG
 	}
 
-	if i.session().endpoints == nil {
+	if i.session().overrides == nil {
 		return nil, syscall.EOPNOTSUPP
 	}
 
-	// Create replaces the directory fid with the newly created/opened
-	// file, so clone this directory so it doesn't change out from under
-	// this node.
-	_, newFile, err := i.fileState.file.walk(ctx, nil)
+	// Stabilize the override map while creation is in progress.
+	unlock := i.session().overrides.lock()
+	defer unlock()
+
+	sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket)
 	if err != nil {
 		return nil, err
 	}
-	// We're not going to use newFile after return.
-	defer newFile.close(ctx)
 
-	// Stabilize the endpoint map while creation is in progress.
-	unlock := i.session().endpoints.lock()
-	defer unlock()
+	// Construct the positive Dirent.
+	childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
+	i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep)
+	return childDir, nil
+}
 
-	// Create a regular file in the gofer and then mark it as a socket by
-	// adding this inode key in the 'endpoints' map.
-	owner := fs.FileOwnerFromContext(ctx)
-	hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID))
-	if err != nil {
-		return nil, err
+// CreateFifo implements fs.InodeOperations.CreateFifo.
+func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
+	if len(name) > maxFilenameLen {
+		return syserror.ENAMETOOLONG
 	}
-	// We're not going to use this file.
-	hostFile.Close()
 
-	i.touchModificationAndStatusChangeTime(ctx, dir)
+	owner := fs.FileOwnerFromContext(ctx)
+	mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe
 
-	// Get the attributes of the file to create inode key.
-	qid, mask, attr, err := getattr(ctx, newFile)
-	if err != nil {
-		return nil, err
+	// N.B. FIFOs use major/minor numbers 0.
+	if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
+		if i.session().overrides == nil || err != syscall.EPERM {
+			return err
+		}
+		// If gofer doesn't support mknod, check if we can create an internal fifo.
+		return i.createInternalFifo(ctx, dir, name, owner, perm)
 	}
 
-	key := device.MultiDeviceKey{
-		Device:          attr.RDev,
-		SecondaryDevice: i.session().connID,
-		Inode:           qid.Path,
+	i.touchModificationAndStatusChangeTime(ctx, dir)
+	return nil
+}
+
+func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error {
+	if i.session().overrides == nil {
+		return syserror.EPERM
 	}
 
-	// Create child dirent.
+	// Stabilize the override map while creation is in progress.
+	unlock := i.session().overrides.lock()
+	defer unlock()
 
-	// Get an unopened p9.File for the file we created so that it can be
-	// cloned and re-opened multiple times after creation.
-	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
+	sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe)
 	if err != nil {
-		return nil, err
+		return err
 	}
 
-	// Construct the InodeOperations.
-	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr, true)
+	// First create a pipe.
+	p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
+
+	// Wrap the fileOps with our Fifo.
+	iops := &fifo{
+		InodeOperations: pipe.NewInodeOperations(ctx, perm, p),
+		fileIops:        fileOps,
+	}
+	inode := fs.NewInode(ctx, iops, dir.MountSource, sattr)
 
 	// Construct the positive Dirent.
 	childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
-	i.session().endpoints.add(key, childDir, ep)
-	return childDir, nil
+	i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode)
+	return nil
 }
 
-// CreateFifo implements fs.InodeOperations.CreateFifo.
-func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
-	if len(name) > maxFilenameLen {
-		return syserror.ENAMETOOLONG
+// Caller must hold Session.endpoint lock.
+func (i *inodeOperations) createEndpointFile(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions, fileType p9.FileMode) (fs.StableAttr, *inodeOperations, error) {
+	_, dirClone, err := i.fileState.file.walk(ctx, nil)
+	if err != nil {
+		return fs.StableAttr{}, nil, err
 	}
+	// We're not going to use dirClone after return.
+	defer dirClone.close(ctx)
 
+	// Create a regular file in the gofer and then mark it as a socket by
+	// adding this inode key in the 'overrides' map.
 	owner := fs.FileOwnerFromContext(ctx)
-	mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe
-
-	// N.B. FIFOs use major/minor numbers 0.
-	if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
-		return err
+	hostFile, err := dirClone.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID))
+	if err != nil {
+		return fs.StableAttr{}, nil, err
 	}
+	// We're not going to use this file.
+	hostFile.Close()
 
 	i.touchModificationAndStatusChangeTime(ctx, dir)
-	return nil
+
+	// Get the attributes of the file to create inode key.
+	qid, mask, attr, err := getattr(ctx, dirClone)
+	if err != nil {
+		return fs.StableAttr{}, nil, err
+	}
+
+	// Get an unopened p9.File for the file we created so that it can be
+	// cloned and re-opened multiple times after creation.
+	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
+	if err != nil {
+		return fs.StableAttr{}, nil, err
+	}
+
+	// Construct new inode with file type overridden.
+	attr.Mode = changeType(attr.Mode, fileType)
+	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr)
+	return sattr, iops, nil
 }
 
 // Remove implements InodeOperations.Remove.
@@ -307,20 +367,23 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
 		return syserror.ENAMETOOLONG
 	}
 
-	var key device.MultiDeviceKey
-	removeSocket := false
-	if i.session().endpoints != nil {
-		// Find out if file being deleted is a socket that needs to be
+	var key *device.MultiDeviceKey
+	if i.session().overrides != nil {
+		// Find out if file being deleted is a socket or pipe that needs to be
 		// removed from endpoint map.
 		if d, err := i.Lookup(ctx, dir, name); err == nil {
 			defer d.DecRef()
-			if fs.IsSocket(d.Inode.StableAttr) {
-				child := d.Inode.InodeOperations.(*inodeOperations)
-				key = child.fileState.key
-				removeSocket = true
 
-				// Stabilize the endpoint map while deletion is in progress.
-				unlock := i.session().endpoints.lock()
+			if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) {
+				switch iops := d.Inode.InodeOperations.(type) {
+				case *inodeOperations:
+					key = &iops.fileState.key
+				case *fifo:
+					key = &iops.fileIops.fileState.key
+				}
+
+				// Stabilize the override map while deletion is in progress.
+				unlock := i.session().overrides.lock()
 				defer unlock()
 			}
 		}
@@ -329,8 +392,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
 	if err := i.fileState.file.unlinkAt(ctx, name, 0); err != nil {
 		return err
 	}
-	if removeSocket {
-		i.session().endpoints.remove(key)
+	if key != nil {
+		i.session().overrides.remove(*key)
 	}
 	i.touchModificationAndStatusChangeTime(ctx, dir)
 
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 498c4645a..f6b3ef178 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -33,60 +33,107 @@ import (
 var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize
 
 // +stateify savable
-type endpointMaps struct {
-	// mu protexts the direntMap, the keyMap, and the pathMap below.
-	mu sync.RWMutex `state:"nosave"`
+type overrideInfo struct {
+	dirent *fs.Dirent
+
+	// endpoint is set when dirent points to a socket. inode must not be set.
+	endpoint transport.BoundEndpoint
+
+	// inode is set when dirent points to a pipe. endpoint must not be set.
+	inode *fs.Inode
+}
 
-	// direntMap links sockets to their dirents.
-	// It is filled concurrently with the keyMap and is stored upon save.
-	// Before saving, this map is used to populate the pathMap.
-	direntMap map[transport.BoundEndpoint]*fs.Dirent
+func (l *overrideInfo) inodeType() fs.InodeType {
+	switch {
+	case l.endpoint != nil:
+		return fs.Socket
+	case l.inode != nil:
+		return fs.Pipe
+	}
+	panic("endpoint or node must be set")
+}
 
-	// keyMap links MultiDeviceKeys (containing inode IDs) to their sockets.
+// +stateify savable
+type overrideMaps struct {
+	// mu protexts the keyMap, and the pathMap below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// keyMap links MultiDeviceKeys (containing inode IDs) to their sockets/pipes.
 	// It is not stored during save because the inode ID may change upon restore.
-	keyMap map[device.MultiDeviceKey]transport.BoundEndpoint `state:"nosave"`
+	keyMap map[device.MultiDeviceKey]*overrideInfo `state:"nosave"`
 
-	// pathMap links the sockets to their paths.
+	// pathMap links the sockets/pipes to their paths.
 	// It is filled before saving from the direntMap and is stored upon save.
 	// Upon restore, this map is used to re-populate the keyMap.
-	pathMap map[transport.BoundEndpoint]string
+	pathMap map[*overrideInfo]string
+}
+
+// addBoundEndpoint adds the bound endpoint to the map.
+// A reference is taken on the dirent argument.
+//
+// Precondition: maps must have been locked with 'lock'.
+func (e *overrideMaps) addBoundEndpoint(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) {
+	d.IncRef()
+	e.keyMap[key] = &overrideInfo{dirent: d, endpoint: ep}
 }
 
-// add adds the endpoint to the maps.
+// addPipe adds the pipe inode to the map.
 // A reference is taken on the dirent argument.
 //
 // Precondition: maps must have been locked with 'lock'.
-func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) {
-	e.keyMap[key] = ep
+func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *fs.Inode) {
 	d.IncRef()
-	e.direntMap[ep] = d
+	e.keyMap[key] = &overrideInfo{dirent: d, inode: inode}
 }
 
 // remove deletes the key from the maps.
 //
 // Precondition: maps must have been locked with 'lock'.
-func (e *endpointMaps) remove(key device.MultiDeviceKey) {
-	endpoint := e.get(key)
+func (e *overrideMaps) remove(key device.MultiDeviceKey) {
+	endpoint := e.keyMap[key]
 	delete(e.keyMap, key)
-
-	d := e.direntMap[endpoint]
-	d.DecRef()
-	delete(e.direntMap, endpoint)
+	endpoint.dirent.DecRef()
 }
 
 // lock blocks other addition and removal operations from happening while
 // the backing file is being created or deleted. Returns a function that unlocks
 // the endpoint map.
-func (e *endpointMaps) lock() func() {
+func (e *overrideMaps) lock() func() {
 	e.mu.Lock()
 	return func() { e.mu.Unlock() }
 }
 
-// get returns the endpoint mapped to the given key.
+// getBoundEndpoint returns the bound endpoint mapped to the given key.
 //
-// Precondition: maps must have been locked for reading.
-func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint {
-	return e.keyMap[key]
+// Precondition: maps must have been locked.
+func (e *overrideMaps) getBoundEndpoint(key device.MultiDeviceKey) transport.BoundEndpoint {
+	if v := e.keyMap[key]; v != nil {
+		return v.endpoint
+	}
+	return nil
+}
+
+// getPipe returns the pipe inode mapped to the given key.
+//
+// Precondition: maps must have been locked.
+func (e *overrideMaps) getPipe(key device.MultiDeviceKey) *fs.Inode {
+	if v := e.keyMap[key]; v != nil {
+		return v.inode
+	}
+	return nil
+}
+
+// getType returns the inode type if there is a corresponding endpoint for the
+// given key. Returns false otherwise.
+func (e *overrideMaps) getType(key device.MultiDeviceKey) (fs.InodeType, bool) {
+	e.mu.Lock()
+	v := e.keyMap[key]
+	e.mu.Unlock()
+
+	if v != nil {
+		return v.inodeType(), true
+	}
+	return 0, false
 }
 
 // session holds state for each 9p session established during sys_mount.
@@ -137,16 +184,16 @@ type session struct {
 	// mounter is the EUID/EGID that mounted this file system.
 	mounter fs.FileOwner `state:"wait"`
 
-	// endpoints is used to map inodes that represent socket files to their
-	// corresponding endpoint. Socket files are created as regular files in the
-	// gofer and their presence in this map indicate that they should indeed be
-	// socket files. This allows unix domain sockets to be used with paths that
-	// belong to a gofer.
+	// overrides is used to map inodes that represent socket/pipes files to their
+	// corresponding endpoint/iops. These files are created as regular files in
+	// the gofer and their presence in this map indicate that they should indeed
+	// be socket/pipe files. This allows unix domain sockets and named pipes to
+	// be used with paths that belong to a gofer.
 	//
 	// TODO(gvisor.dev/issue/1200): there are few possible races with someone
 	// stat'ing the file and another deleting it concurrently, where the file
 	// will not be reported as socket file.
-	endpoints *endpointMaps `state:"wait"`
+	overrides *overrideMaps `state:"wait"`
 }
 
 // Destroy tears down the session.
@@ -179,15 +226,21 @@ func (s *session) SaveInodeMapping(inode *fs.Inode, path string) {
 	// This is very unintuitive. We *CANNOT* trust the inode's StableAttrs,
 	// because overlay copyUp may have changed them out from under us.
 	// So much for "immutable".
-	sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr
-	s.inodeMappings[sattr.InodeID] = path
+	switch iops := inode.InodeOperations.(type) {
+	case *inodeOperations:
+		s.inodeMappings[iops.fileState.sattr.InodeID] = path
+	case *fifo:
+		s.inodeMappings[iops.fileIops.fileState.sattr.InodeID] = path
+	default:
+		panic(fmt.Sprintf("Invalid type: %T", iops))
+	}
 }
 
-// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes
-// (p9.QID, p9.AttrMask, p9.Attr).
+// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File
+// and attributes (p9.QID, p9.AttrMask, p9.Attr).
 //
 // Endpoints lock must not be held if socket == false.
-func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr, socket bool) (fs.StableAttr, *inodeOperations) {
+func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) {
 	deviceKey := device.MultiDeviceKey{
 		Device:          attr.RDev,
 		SecondaryDevice: s.connID,
@@ -201,17 +254,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p
 		BlockSize: bsize(attr),
 	}
 
-	if s.endpoints != nil {
-		if socket {
-			sattr.Type = fs.Socket
-		} else {
-			// If unix sockets are allowed on this filesystem, check if this file is
-			// supposed to be a socket file.
-			unlock := s.endpoints.lock()
-			if s.endpoints.get(deviceKey) != nil {
-				sattr.Type = fs.Socket
-			}
-			unlock()
+	if s.overrides != nil && sattr.Type == fs.RegularFile {
+		// If overrides are allowed on this filesystem, check if this file is
+		// supposed to be of a different type, e.g. socket.
+		if t, ok := s.overrides.getType(deviceKey); ok {
+			sattr.Type = t
 		}
 	}
 
@@ -267,7 +314,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 	s.EnableLeakCheck("gofer.session")
 
 	if o.privateunixsocket {
-		s.endpoints = newEndpointMaps()
+		s.overrides = newOverrideMaps()
 	}
 
 	// Construct the MountSource with the session and superBlockFlags.
@@ -305,26 +352,24 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 		return nil, err
 	}
 
-	sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false)
+	sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr)
 	return fs.NewInode(ctx, iops, m, sattr), nil
 }
 
-// newEndpointMaps creates a new endpointMaps.
-func newEndpointMaps() *endpointMaps {
-	return &endpointMaps{
-		direntMap: make(map[transport.BoundEndpoint]*fs.Dirent),
-		keyMap:    make(map[device.MultiDeviceKey]transport.BoundEndpoint),
-		pathMap:   make(map[transport.BoundEndpoint]string),
+// newOverrideMaps creates a new overrideMaps.
+func newOverrideMaps() *overrideMaps {
+	return &overrideMaps{
+		keyMap:  make(map[device.MultiDeviceKey]*overrideInfo),
+		pathMap: make(map[*overrideInfo]string),
 	}
 }
 
-// fillKeyMap populates key and dirent maps upon restore from saved
-// pathmap.
+// fillKeyMap populates key and dirent maps upon restore from saved pathmap.
 func (s *session) fillKeyMap(ctx context.Context) error {
-	unlock := s.endpoints.lock()
+	unlock := s.overrides.lock()
 	defer unlock()
 
-	for ep, dirPath := range s.endpoints.pathMap {
+	for ep, dirPath := range s.overrides.pathMap {
 		_, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath))
 		if err != nil {
 			return fmt.Errorf("error filling endpointmaps, failed to walk to %q: %v", dirPath, err)
@@ -341,25 +386,25 @@ func (s *session) fillKeyMap(ctx context.Context) error {
 			Inode:           qid.Path,
 		}
 
-		s.endpoints.keyMap[key] = ep
+		s.overrides.keyMap[key] = ep
 	}
 	return nil
 }
 
-// fillPathMap populates paths for endpoints from dirents in direntMap
+// fillPathMap populates paths for overrides from dirents in direntMap
 // before save.
 func (s *session) fillPathMap() error {
-	unlock := s.endpoints.lock()
+	unlock := s.overrides.lock()
 	defer unlock()
 
-	for ep, dir := range s.endpoints.direntMap {
-		mountRoot := dir.MountRoot()
+	for _, endpoint := range s.overrides.keyMap {
+		mountRoot := endpoint.dirent.MountRoot()
 		defer mountRoot.DecRef()
-		dirPath, _ := dir.FullName(mountRoot)
+		dirPath, _ := endpoint.dirent.FullName(mountRoot)
 		if dirPath == "" {
 			return fmt.Errorf("error getting path from dirent")
 		}
-		s.endpoints.pathMap[ep] = dirPath
+		s.overrides.pathMap[endpoint] = dirPath
 	}
 	return nil
 }
@@ -368,7 +413,7 @@ func (s *session) fillPathMap() error {
 func (s *session) restoreEndpointMaps(ctx context.Context) error {
 	// When restoring, only need to create the keyMap because the dirent and path
 	// maps got stored through the save.
-	s.endpoints.keyMap = make(map[device.MultiDeviceKey]transport.BoundEndpoint)
+	s.overrides.keyMap = make(map[device.MultiDeviceKey]*overrideInfo)
 	if err := s.fillKeyMap(ctx); err != nil {
 		return fmt.Errorf("failed to insert sockets into endpoint map: %v", err)
 	}
@@ -376,6 +421,6 @@ func (s *session) restoreEndpointMaps(ctx context.Context) error {
 	// Re-create pathMap because it can no longer be trusted as socket paths can
 	// change while process continues to run. Empty pathMap will be re-filled upon
 	// next save.
-	s.endpoints.pathMap = make(map[transport.BoundEndpoint]string)
+	s.overrides.pathMap = make(map[*overrideInfo]string)
 	return nil
 }
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 0285c5361..111da59f9 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -25,9 +25,9 @@ import (
 
 // beforeSave is invoked by stateify.
 func (s *session) beforeSave() {
-	if s.endpoints != nil {
+	if s.overrides != nil {
 		if err := s.fillPathMap(); err != nil {
-			panic("failed to save paths to endpoint map before saving" + err.Error())
+			panic("failed to save paths to override map before saving" + err.Error())
 		}
 	}
 }
@@ -74,10 +74,10 @@ func (s *session) afterLoad() {
 		panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname))
 	}
 
-	// Check if endpointMaps exist when uds sockets are enabled
-	// (only pathmap will actualy have been saved).
-	if opts.privateunixsocket != (s.endpoints != nil) {
-		panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil))
+	// Check if overrideMaps exist when uds sockets are enabled (only pathmaps
+	// will actually have been saved).
+	if opts.privateunixsocket != (s.overrides != nil) {
+		panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.overrides != nil))
 	}
 	if args.Flags != s.superBlockFlags {
 		panic(fmt.Sprintf("new mount flags %v, want %v", args.Flags, s.superBlockFlags))
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index 376cfce2c..10ba2f5f0 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -32,15 +32,15 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.
 		return nil
 	}
 
-	if i.session().endpoints != nil {
-		unlock := i.session().endpoints.lock()
+	if i.session().overrides != nil {
+		unlock := i.session().overrides.lock()
 		defer unlock()
-		ep := i.session().endpoints.get(i.fileState.key)
+		ep := i.session().overrides.getBoundEndpoint(i.fileState.key)
 		if ep != nil {
 			return ep
 		}
 
-		// Not found in endpoints map, it may be a gofer backed unix socket...
+		// Not found in overrides map, it may be a gofer backed unix socket...
 	}
 
 	inode.IncRef()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 40e974314..8f2b75a1c 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -225,7 +225,6 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:mknod_test",
-    use_tmpfs = True,  # mknod is not supported over gofer.
 )
 
 syscall_test(
-- 
cgit v1.2.3


From 492229d0176c1af2ab4ea4cf91bf211e940b5b12 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 4 Feb 2020 11:28:36 -0800
Subject: VFS2 gofer client

Updates #1198

Opening host pipes (by spinning in fdpipe) and host sockets is not yet
complete, and will be done in a future CL.

Major differences from VFS1 gofer client (sentry/fs/gofer), with varying levels
of backportability:

- "Cache policies" are replaced by InteropMode, which control the behavior of
  timestamps in addition to caching. Under InteropModeExclusive (analogous to
  cacheAll) and InteropModeWritethrough (analogous to cacheAllWritethrough),
  client timestamps are *not* written back to the server (it is not possible in
  9P or Linux for clients to set ctime, so writing back client-authoritative
  timestamps results in incoherence between atime/mtime and ctime). Under
  InteropModeShared (analogous to cacheRemoteRevalidating), client timestamps
  are not used at all (remote filesystem clocks are authoritative). cacheNone
  is translated to InteropModeShared + new option
  filesystemOptions.specialRegularFiles.

- Under InteropModeShared, "unstable attribute" reloading for permission
  checks, lookup, and revalidation are fused, which is feasible in VFS2 since
  gofer.filesystem controls path resolution. This results in a ~33% reduction
  in RPCs for filesystem operations compared to cacheRemoteRevalidating. For
  example, consider stat("/foo/bar/baz") where "/foo/bar/baz" fails
  revalidation, resulting in the instantiation of a new dentry:

  VFS1 RPCs:
  getattr("/")                          // fs.MountNamespace.FindLink() => fs.Inode.CheckPermission() => gofer.inodeOperations.check() => gofer.inodeOperations.UnstableAttr()
  walkgetattr("/", "foo") = fid1        // fs.Dirent.walk() => gofer.session.Revalidate() => gofer.cachePolicy.Revalidate()
  clunk(fid1)
  getattr("/foo")                       // CheckPermission
  walkgetattr("/foo", "bar") = fid2     // Revalidate
  clunk(fid2)
  getattr("/foo/bar")                   // CheckPermission
  walkgetattr("/foo/bar", "baz") = fid3 // Revalidate
  clunk(fid3)
  walkgetattr("/foo/bar", "baz") = fid4 // fs.Dirent.walk() => gofer.inodeOperations.Lookup
  getattr("/foo/bar/baz")               // linux.stat() => gofer.inodeOperations.UnstableAttr()

  VFS2 RPCs:
  getattr("/")                          // gofer.filesystem.walkExistingLocked()
  walkgetattr("/", "foo") = fid1        // gofer.filesystem.stepExistingLocked()
  clunk(fid1)
                                        // No getattr: walkgetattr already updated metadata for permission check
  walkgetattr("/foo", "bar") = fid2
  clunk(fid2)
  walkgetattr("/foo/bar", "baz") = fid3
                                        // No clunk: fid3 used for new gofer.dentry
                                        // No getattr: walkgetattr already updated metadata for stat()

- gofer.filesystem.unlinkAt() does not require instantiation of a dentry that
  represents the file to be deleted. Updates #898.

- gofer.regularFileFD.OnClose() skips Tflushf for regular files under
  InteropModeExclusive, as it's nonsensical to request a remote file flush
  without flushing locally-buffered writes to that remote file first.

- Symlink targets are cached when InteropModeShared is not in effect.

- p9.QID.Path (which is already required to be unique for each file within a
  server, and is accordingly already synthesized from device/inode numbers in
  all known gofers) is used as-is for inode numbers, rather than being mapped
  along with attr.RDev in the client to yet another synthetic inode number.

- Relevant parts of fsutil.CachingInodeOperations are inlined directly into
  gofer package code. This avoids having to duplicate part of its functionality
  in fsutil.HostMappable.

PiperOrigin-RevId: 293190213
---
 pkg/safemem/seq_unsafe.go                |   17 +
 pkg/sentry/fs/fsutil/BUILD               |    4 +-
 pkg/sentry/fs/fsutil/frame_ref_set.go    |   13 +-
 pkg/sentry/fs/fsutil/inode_cached.go     |    2 +-
 pkg/sentry/fsimpl/gofer/BUILD            |   55 ++
 pkg/sentry/fsimpl/gofer/directory.go     |  190 +++++
 pkg/sentry/fsimpl/gofer/filesystem.go    | 1087 ++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/gofer.go         | 1147 ++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/handle.go        |  135 ++++
 pkg/sentry/fsimpl/gofer/handle_unsafe.go |   66 ++
 pkg/sentry/fsimpl/gofer/p9file.go        |  219 ++++++
 pkg/sentry/fsimpl/gofer/pagemath.go      |   31 +
 pkg/sentry/fsimpl/gofer/regular_file.go  |  860 ++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/special_file.go  |  159 +++++
 pkg/sentry/fsimpl/gofer/symlink.go       |   47 ++
 pkg/sentry/fsimpl/gofer/time.go          |   75 ++
 pkg/sentry/fsimpl/tmpfs/filesystem.go    |    2 +-
 pkg/sentry/socket/hostinet/socket.go     |   23 +-
 18 files changed, 4103 insertions(+), 29 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/gofer/BUILD
 create mode 100644 pkg/sentry/fsimpl/gofer/directory.go
 create mode 100644 pkg/sentry/fsimpl/gofer/filesystem.go
 create mode 100644 pkg/sentry/fsimpl/gofer/gofer.go
 create mode 100644 pkg/sentry/fsimpl/gofer/handle.go
 create mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go
 create mode 100644 pkg/sentry/fsimpl/gofer/p9file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/pagemath.go
 create mode 100644 pkg/sentry/fsimpl/gofer/regular_file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/special_file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/symlink.go
 create mode 100644 pkg/sentry/fsimpl/gofer/time.go

(limited to 'pkg')

diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index 354a95dde..dcdfc9600 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"reflect"
+	"syscall"
 	"unsafe"
 )
 
@@ -297,3 +298,19 @@ func ZeroSeq(dsts BlockSeq) (uint64, error) {
 	}
 	return done, nil
 }
+
+// IovecsFromBlockSeq returns a []syscall.Iovec representing seq.
+func IovecsFromBlockSeq(bs BlockSeq) []syscall.Iovec {
+	iovs := make([]syscall.Iovec, 0, bs.NumBlocks())
+	for ; !bs.IsEmpty(); bs = bs.Tail() {
+		b := bs.Head()
+		iovs = append(iovs, syscall.Iovec{
+			Base: &b.ToSlice()[0],
+			Len:  uint64(b.Len()),
+		})
+		// We don't need to care about b.NeedSafecopy(), because the host
+		// kernel will handle such address ranges just fine (by returning
+		// EFAULT).
+	}
+	return iovs
+}
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 4ab2a384f..789369220 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -28,13 +28,13 @@ go_template_instance(
         "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
     },
     package = "fsutil",
-    prefix = "frameRef",
+    prefix = "FrameRef",
     template = "//pkg/segment:generic_set",
     types = {
         "Key": "uint64",
         "Range": "platform.FileRange",
         "Value": "uint64",
-        "Functions": "frameRefSetFunctions",
+        "Functions": "FrameRefSetFunctions",
     },
 )
 
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index dd63db32b..6564fd0c6 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -20,24 +20,25 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 )
 
-type frameRefSetFunctions struct{}
+// FrameRefSetFunctions implements segment.Functions for FrameRefSet.
+type FrameRefSetFunctions struct{}
 
 // MinKey implements segment.Functions.MinKey.
-func (frameRefSetFunctions) MinKey() uint64 {
+func (FrameRefSetFunctions) MinKey() uint64 {
 	return 0
 }
 
 // MaxKey implements segment.Functions.MaxKey.
-func (frameRefSetFunctions) MaxKey() uint64 {
+func (FrameRefSetFunctions) MaxKey() uint64 {
 	return math.MaxUint64
 }
 
 // ClearValue implements segment.Functions.ClearValue.
-func (frameRefSetFunctions) ClearValue(val *uint64) {
+func (FrameRefSetFunctions) ClearValue(val *uint64) {
 }
 
 // Merge implements segment.Functions.Merge.
-func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
+func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
 	if val1 != val2 {
 		return 0, false
 	}
@@ -45,6 +46,6 @@ func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
 }
 
 // Split implements segment.Functions.Split.
-func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
+func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
 	return val, val
 }
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 573b8586e..800c8b4e1 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -111,7 +111,7 @@ type CachingInodeOperations struct {
 	// refs tracks active references to data in the cache.
 	//
 	// refs is protected by dataMu.
-	refs frameRefSet
+	refs FrameRefSet
 }
 
 // CachingInodeOperationsOptions configures a CachingInodeOperations.
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
new file mode 100644
index 000000000..4ba76a1e8
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -0,0 +1,55 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "gofer",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dentry",
+        "Linker": "*dentry",
+    },
+)
+
+go_library(
+    name = "gofer",
+    srcs = [
+        "dentry_list.go",
+        "directory.go",
+        "filesystem.go",
+        "gofer.go",
+        "handle.go",
+        "handle_unsafe.go",
+        "p9file.go",
+        "pagemath.go",
+        "regular_file.go",
+        "special_file.go",
+        "symlink.go",
+        "time.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fd",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/safemem",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/unet",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
new file mode 100644
index 000000000..baa2cdd8e
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -0,0 +1,190 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+	return d.fileType() == linux.S_IFDIR
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop !=
+// InteropModeShared.
+func (d *dentry) cacheNegativeChildLocked(name string) {
+	if d.negativeChildren == nil {
+		d.negativeChildren = make(map[string]struct{})
+	}
+	d.negativeChildren[name] = struct{}{}
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	mu      sync.Mutex
+	off     int64
+	dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	if fd.dirents == nil {
+		ds, err := fd.dentry().getDirents(ctx)
+		if err != nil {
+			return err
+		}
+		fd.dirents = ds
+	}
+
+	for fd.off < int64(len(fd.dirents)) {
+		if !cb.Handle(fd.dirents[fd.off]) {
+			return nil
+		}
+		fd.off++
+	}
+	return nil
+}
+
+// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+	// 9P2000.L's readdir does not specify behavior in the presence of
+	// concurrent mutation of an iterated directory, so implementations may
+	// duplicate or omit entries in this case, which violates POSIX semantics.
+	// Thus we read all directory entries while holding d.dirMu to exclude
+	// directory mutations. (Note that it is impossible for the client to
+	// exclude concurrent mutation from other remote filesystem users. Since
+	// there is no way to detect if the server has incorrectly omitted
+	// directory entries, we simply assume that the server is well-behaved
+	// under InteropModeShared.) This is inconsistent with Linux (which appears
+	// to assume that directory fids have the correct semantics, and translates
+	// struct file_operations::readdir calls directly to readdir RPCs), but is
+	// consistent with VFS1.
+
+	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+	if d.dirents != nil {
+		return d.dirents, nil
+	}
+
+	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
+	// so we generate them here.
+	parent := d.vfsd.ParentOrSelf().Impl().(*dentry)
+	dirents := []vfs.Dirent{
+		{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     d.ino,
+			NextOff: 1,
+		},
+		{
+			Name:    "..",
+			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
+			Ino:     parent.ino,
+			NextOff: 2,
+		},
+	}
+	off := uint64(0)
+	const count = 64 * 1024 // for consistency with the vfs1 client
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	if !d.handleReadable {
+		// This should not be possible because a readable handle should have
+		// been opened when the calling directoryFD was opened.
+		panic("gofer.dentry.getDirents called without a readable handle")
+	}
+	for {
+		p9ds, err := d.handle.file.readdir(ctx, off, count)
+		if err != nil {
+			return nil, err
+		}
+		if len(p9ds) == 0 {
+			// Cache dirents for future directoryFDs if permitted.
+			if d.fs.opts.interop != InteropModeShared {
+				d.dirents = dirents
+			}
+			return dirents, nil
+		}
+		for _, p9d := range p9ds {
+			if p9d.Name == "." || p9d.Name == ".." {
+				continue
+			}
+			dirent := vfs.Dirent{
+				Name:    p9d.Name,
+				Ino:     p9d.QID.Path,
+				NextOff: int64(len(dirents) + 1),
+			}
+			// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+			// DMSOCKET.
+			switch p9d.Type {
+			case p9.TypeSymlink:
+				dirent.Type = linux.DT_LNK
+			case p9.TypeDir:
+				dirent.Type = linux.DT_DIR
+			default:
+				dirent.Type = linux.DT_REG
+			}
+			dirents = append(dirents, dirent)
+		}
+		off = p9ds[len(p9ds)-1].Offset
+	}
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		if offset == 0 {
+			// Ensure that the next call to fd.IterDirents() calls
+			// fd.dentry().getDirents().
+			fd.dirents = nil
+		}
+		fd.off = offset
+		return fd.off, nil
+	case linux.SEEK_CUR:
+		offset += fd.off
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		// Don't clear fd.dirents in this case, even if offset == 0.
+		fd.off = offset
+		return fd.off, nil
+	default:
+		return 0, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
new file mode 100644
index 000000000..8eb61debf
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -0,0 +1,1087 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// Snapshot current dentries and special files.
+	fs.syncMu.Lock()
+	ds := make([]*dentry, 0, len(fs.dentries))
+	for d := range fs.dentries {
+		ds = append(ds, d)
+	}
+	sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
+	for sffd := range fs.specialFileFDs {
+		sffds = append(sffds, sffd)
+	}
+	fs.syncMu.Unlock()
+
+	// Return the first error we encounter, but sync everything we can
+	// regardless.
+	var retErr error
+
+	// Sync regular files.
+	for _, d := range ds {
+		if !d.TryIncRef() {
+			continue
+		}
+		err := d.syncSharedHandle(ctx)
+		d.DecRef()
+		if err != nil && retErr == nil {
+			retErr = err
+		}
+	}
+
+	// Sync special files, which may be writable but do not use dentry shared
+	// handles (so they won't be synced by the above).
+	for _, sffd := range sffds {
+		if !sffd.vfsfd.TryIncRef() {
+			continue
+		}
+		err := sffd.Sync(ctx)
+		sffd.vfsfd.DecRef()
+		if err != nil && retErr == nil {
+			retErr = err
+		}
+	}
+
+	return retErr
+}
+
+// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
+// encoding of strings, which uses 2 bytes for the length prefix.
+const maxFilenameLen = (1 << 16) - 1
+
+// dentrySlicePool is a pool of *[]*dentry used to store dentries for which
+// dentry.checkCachingLocked() must be called. The pool holds pointers to
+// slices because Go lacks generics, so sync.Pool operates on interface{}, so
+// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy
+// of the slice header on the heap.
+var dentrySlicePool = sync.Pool{
+	New: func() interface{} {
+		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+		return &ds
+	},
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil {
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	// Allow dentries to be GC'd.
+	for i := range *ds {
+		(*ds)[i] = nil
+	}
+	*ds = (*ds)[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may become cached as a result of the traversal are appended
+// to *ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached
+// metadata must be up to date.
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		parentVFSD, err := rp.ResolveParent(&d.vfsd)
+		if err != nil {
+			return nil, err
+		}
+		parent := parentVFSD.Impl().(*dentry)
+		if fs.opts.interop == InteropModeShared {
+			// We must assume that parentVFSD is correct, because if d has been
+			// moved elsewhere in the remote filesystem so that its parent has
+			// changed, we have no way of determining its new parent's location
+			// in the filesystem. Get updated metadata for parentVFSD.
+			_, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask())
+			if err != nil {
+				return nil, err
+			}
+			parent.updateFromP9Attrs(attrMask, &attr)
+		}
+		rp.Advance()
+		return parent, nil
+	}
+	childVFSD, err := rp.ResolveChild(&d.vfsd, name)
+	if err != nil {
+		return nil, err
+	}
+	// FIXME(jamieliu): Linux performs revalidation before mount lookup
+	// (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(),
+	// __follow_mount_rcu()).
+	child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds)
+	if err != nil {
+		return nil, err
+	}
+	if child == nil {
+		return nil, syserror.ENOENT
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct. If no file
+// exists at name, revalidateChildLocked returns (nil, nil).
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// parent.isDir(). name is not "." or "..".
+//
+// Postconditions: If revalidateChildLocked returns a non-nil dentry, its
+// cached metadata is up to date.
+func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) {
+	if childVFSD != nil && fs.opts.interop != InteropModeShared {
+		// We have a cached dentry that is assumed to be correct.
+		return childVFSD.Impl().(*dentry), nil
+	}
+	// We either don't have a cached dentry or need to verify that it's still
+	// correct, either of which requires a remote lookup. Check if this name is
+	// valid before performing the lookup.
+	if len(name) > maxFilenameLen {
+		return nil, syserror.ENAMETOOLONG
+	}
+	// Check if we've already cached this lookup with a negative result.
+	if _, ok := parent.negativeChildren[name]; ok {
+		return nil, nil
+	}
+	// Perform the remote lookup.
+	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
+	if err != nil && err != syserror.ENOENT {
+		return nil, err
+	}
+	if childVFSD != nil {
+		child := childVFSD.Impl().(*dentry)
+		if !file.isNil() && qid.Path == child.ino {
+			// The file at this path hasn't changed. Just update cached
+			// metadata.
+			file.close(ctx)
+			child.updateFromP9Attrs(attrMask, &attr)
+			return child, nil
+		}
+		// The file at this path has changed or no longer exists. Remove
+		// the stale dentry from the tree, and re-evaluate its caching
+		// status (i.e. if it has 0 references, drop it).
+		vfsObj.ForceDeleteDentry(childVFSD)
+		*ds = appendDentry(*ds, child)
+		childVFSD = nil
+	}
+	if file.isNil() {
+		// No file exists at this path now. Cache the negative lookup if
+		// allowed.
+		if fs.opts.interop != InteropModeShared {
+			parent.cacheNegativeChildLocked(name)
+		}
+		return nil, nil
+	}
+	// Create a new dentry representing the file.
+	child, err := fs.newDentry(ctx, file, qid, attrMask, &attr)
+	if err != nil {
+		file.close(ctx)
+		return nil, err
+	}
+	parent.IncRef() // reference held by child on its parent
+	parent.vfsd.InsertChild(&child.vfsd, name)
+	// For now, child has 0 references, so our caller should call
+	// child.checkCachingLocked().
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop ==
+// InteropModeShared, then d's cached metadata must be up to date.
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for rp.Start() as required by fs.stepLocked().
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	for !rp.Done() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	if parent.isDeleted() {
+		return syserror.ENOENT
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	if len(name) > maxFilenameLen {
+		return syserror.ENAMETOOLONG
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	if fs.opts.interop == InteropModeShared {
+		// The existence of a dentry at name would be inconclusive because the
+		// file it represents may have been deleted from the remote filesystem,
+		// so we would need to make an RPC to revalidate the dentry. Just
+		// attempt the file creation RPC instead. If a file does exist, the RPC
+		// will fail with EEXIST like we would have. If the RPC succeeds, and a
+		// stale dentry exists, the dentry will fail revalidation next time
+		// it's used.
+		return create(parent, name)
+	}
+	if parent.vfsd.Child(name) != nil {
+		return syserror.EEXIST
+	}
+	// No cached dentry exists; however, there might still be an existing file
+	// at name. As above, we attempt the file creation RPC anyway.
+	if err := create(parent, name); err != nil {
+		return err
+	}
+	parent.touchCMtime(ctx)
+	delete(parent.negativeChildren, name)
+	parent.dirents = nil
+	return nil
+}
+
+// Preconditions: !rp.Done().
+func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	name := rp.Component()
+	if dir {
+		if name == "." {
+			return syserror.EINVAL
+		}
+		if name == ".." {
+			return syserror.ENOTEMPTY
+		}
+	} else {
+		if name == "." || name == ".." {
+			return syserror.EISDIR
+		}
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	childVFSD := parent.vfsd.Child(name)
+	var child *dentry
+	// We only need a dentry representing the file at name if it can be a mount
+	// point. If childVFSD is nil, then it can't be a mount point. If childVFSD
+	// is non-nil but stale, the actual file can't be a mount point either; we
+	// detect this case by just speculatively calling PrepareDeleteDentry and
+	// only revalidating the dentry if that fails (indicating that the existing
+	// dentry is a mount point).
+	if childVFSD != nil {
+		child = childVFSD.Impl().(*dentry)
+		if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds)
+			if err != nil {
+				return err
+			}
+			if child != nil {
+				childVFSD = &child.vfsd
+				if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+					return err
+				}
+			} else {
+				childVFSD = nil
+			}
+		}
+	} else if _, ok := parent.negativeChildren[name]; ok {
+		return syserror.ENOENT
+	}
+	flags := uint32(0)
+	if dir {
+		if child != nil && !child.isDir() {
+			return syserror.ENOTDIR
+		}
+		flags = linux.AT_REMOVEDIR
+	} else {
+		if child != nil && child.isDir() {
+			return syserror.EISDIR
+		}
+		if rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+	err = parent.file.unlinkAt(ctx, name, flags)
+	if err != nil {
+		if childVFSD != nil {
+			vfsObj.AbortDeleteDentry(childVFSD)
+		}
+		return err
+	}
+	if fs.opts.interop != InteropModeShared {
+		parent.touchCMtime(ctx)
+		parent.cacheNegativeChildLocked(name)
+		parent.dirents = nil
+	}
+	if child != nil {
+		child.setDeleted()
+		vfsObj.CommitDeleteDentry(childVFSD)
+		ds = appendDentry(ds, child)
+	}
+	return nil
+}
+
+// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls
+// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil {
+		return
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock()
+		for _, d := range **ds {
+			d.checkCachingLocked()
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
+	if *ds == nil {
+		fs.renameMu.Unlock()
+		return
+	}
+	for _, d := range **ds {
+		d.checkCachingLocked()
+	}
+	fs.renameMu.Unlock()
+	putDentrySlice(*ds)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		// 9P2000.L supports hard links, but we don't.
+		return syserror.EPERM
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		return err
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		return err
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Reject O_TMPFILE, which is not supported; supporting it correctly in the
+	// presence of other remote filesystem users requires remote filesystem
+	// support, and it isn't clear that there's any way to implement this in
+	// 9P.
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		return nil, syserror.EOPNOTSUPP
+	}
+	mayCreate := opts.Flags&linux.O_CREAT != 0
+	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by fs.stepLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	if rp.Done() {
+		return start.openLocked(ctx, rp, opts.Flags)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+	// Determine whether or not we need to create a file.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, &ds)
+	if err == syserror.ENOENT && mayCreate {
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+		parent.dirMu.Unlock()
+		return fd, err
+	}
+	if err != nil {
+		parent.dirMu.Unlock()
+		return nil, err
+	}
+	// Open existing child or follow symlink.
+	parent.dirMu.Unlock()
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent
+		goto afterTrailingSymlink
+	}
+	return child.openLocked(ctx, rp, opts.Flags)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
+		return nil, err
+	}
+	mnt := rp.Mount()
+	filetype := d.fileType()
+	switch {
+	case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD:
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil {
+			return nil, err
+		}
+		fd := &regularFileFD{}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+			AllowDirectIO: true,
+		}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case filetype == linux.S_IFDIR:
+		// Can't open directories with O_CREAT.
+		if flags&linux.O_CREAT != 0 {
+			return nil, syserror.EISDIR
+		}
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		if flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+			return nil, err
+		}
+		fd := &directoryFD{}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case filetype == linux.S_IFLNK:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	default:
+		if flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0)
+		if err != nil {
+			return nil, err
+		}
+		fd := &specialFileFD{
+			handle: h,
+		}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			h.close(ctx)
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	}
+}
+
+// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		return nil, err
+	}
+	if d.isDeleted() {
+		return nil, syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return nil, err
+	}
+	defer mnt.EndWrite()
+
+	// 9P2000.L's lcreate takes a fid representing the parent directory, and
+	// converts it into an open fid representing the created file, so we need
+	// to duplicate the directory fid first.
+	_, dirfile, err := d.file.walk(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	creds := rp.Credentials()
+	name := rp.Component()
+	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+	if err != nil {
+		dirfile.close(ctx)
+		return nil, err
+	}
+	// Then we need to walk to the file we just created to get a non-open fid
+	// representing it, and to get its metadata. This must use d.file since, as
+	// explained above, dirfile was invalidated by dirfile.Create().
+	walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+	if err != nil {
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, err
+	}
+	// Sanity-check that we walked to the file we created.
+	if createQID.Path != walkQID.Path {
+		// Probably due to concurrent remote filesystem mutation?
+		ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop)
+		nonOpenFile.close(ctx)
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, syserror.EAGAIN
+	}
+
+	// Construct the new dentry.
+	child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
+	if err != nil {
+		nonOpenFile.close(ctx)
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, err
+	}
+	// Incorporate the fid that was opened by lcreate.
+	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
+	if useRegularFileFD {
+		child.handleMu.Lock()
+		child.handle.file = openFile
+		if fdobj != nil {
+			child.handle.fd = int32(fdobj.Release())
+		}
+		child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags)
+		child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
+		child.handleMu.Unlock()
+	}
+	// Take a reference on the new dentry to be held by the new file
+	// description. (This reference also means that the new dentry is not
+	// eligible for caching yet, so we don't need to append to a dentry slice.)
+	child.refs = 1
+	// Insert the dentry into the tree.
+	d.IncRef() // reference held by child on its parent d
+	d.vfsd.InsertChild(&child.vfsd, name)
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchCMtime(ctx)
+		delete(d.negativeChildren, name)
+		d.dirents = nil
+	}
+
+	// Finally, construct a file description representing the created file.
+	var childVFSFD *vfs.FileDescription
+	mnt.IncRef()
+	if useRegularFileFD {
+		fd := &regularFileFD{}
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
+			AllowDirectIO: true,
+		}); err != nil {
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	} else {
+		fd := &specialFileFD{
+			handle: handle{
+				file: openFile,
+				fd:   -1,
+			},
+		}
+		if fdobj != nil {
+			fd.handle.fd = int32(fdobj.Release())
+		}
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			fd.handle.close(ctx)
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	}
+	return childVFSFD, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	if !d.isSymlink() {
+		return "", syserror.EINVAL
+	}
+	return d.readlink(ctx, rp.Mount())
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// Requires 9P support.
+		return syserror.EINVAL
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.Lock()
+	defer fs.renameMuUnlockAndCheckCaching(&ds)
+	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		if err := oldParent.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	vfsObj := rp.VirtualFilesystem()
+	// We need a dentry representing the renamed file since, if it's a
+	// directory, we need to check for write permission on it.
+	oldParent.dirMu.Lock()
+	defer oldParent.dirMu.Unlock()
+	renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds)
+	if err != nil {
+		return err
+	}
+	if renamed == nil {
+		return syserror.ENOENT
+	}
+	if renamed.isDir() {
+		if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if oldParent != newParent {
+		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+			return err
+		}
+		newParent.dirMu.Lock()
+		defer newParent.dirMu.Unlock()
+	}
+	if newParent.isDeleted() {
+		return syserror.ENOENT
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	// This is similar to unlinkAt, except:
+	//
+	// - We revalidate the replaced dentry unconditionally for simplicity.
+	//
+	// - If rp.MustBeDir(), then we need a dentry representing the replaced
+	// file regardless to confirm that it's a directory.
+	if replacedVFSD != nil || rp.MustBeDir() {
+		replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds)
+		if err != nil {
+			return err
+		}
+		if replaced != nil {
+			if replaced.isDir() {
+				if !renamed.isDir() {
+					return syserror.EISDIR
+				}
+			} else {
+				if rp.MustBeDir() || renamed.isDir() {
+					return syserror.ENOTDIR
+				}
+			}
+			replacedVFSD = &replaced.vfsd
+		} else {
+			replacedVFSD = nil
+		}
+	}
+
+	if oldParent == newParent && oldName == newName {
+		return nil
+	}
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil {
+		return err
+	}
+	if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+		return err
+	}
+	if fs.opts.interop != InteropModeShared {
+		oldParent.cacheNegativeChildLocked(oldName)
+		oldParent.dirents = nil
+		delete(newParent.negativeChildren, newName)
+		newParent.dirents = nil
+	}
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, true /* dir */)
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount())
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	// Since walking updates metadata for all traversed dentries under
+	// InteropModeShared, including the returned one, we can return cached
+	// metadata here regardless of fs.opts.interop.
+	var stat linux.Statx
+	d.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	fsstat, err := d.file.statFS(ctx)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	nameLen := uint64(fsstat.NameLength)
+	if nameLen > maxFilenameLen {
+		nameLen = maxFilenameLen
+	}
+	return linux.Statfs{
+		// This is primarily for distinguishing a gofer file system in
+		// tests. Testing is important, so instead of defining
+		// something completely random, use a standard value.
+		Type:            linux.V9FS_MAGIC,
+		BlockSize:       int64(fsstat.BlockSize),
+		Blocks:          fsstat.Blocks,
+		BlocksFree:      fsstat.BlocksFree,
+		BlocksAvailable: fsstat.BlocksAvailable,
+		Files:           fsstat.Files,
+		FilesFree:       fsstat.FilesFree,
+		NameLength:      nameLen,
+	}, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		return err
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, false /* dir */)
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	return d.listxattr(ctx)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	return d.getxattr(ctx, name)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.setxattr(ctx, &opts)
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.removexattr(ctx, name)
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
new file mode 100644
index 000000000..d0552bd99
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -0,0 +1,1147 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gofer provides a filesystem implementation that is backed by a 9p
+// server, interchangably referred to as "gofers" throughout this package.
+//
+// Lock order:
+//   regularFileFD/directoryFD.mu
+//     filesystem.renameMu
+//       dentry.dirMu
+//         filesystem.syncMu
+//         dentry.metadataMu
+//           *** "memmap.Mappable locks" below this point
+//           dentry.mapsMu
+//             *** "memmap.Mappable locks taken by Translate" below this point
+//             dentry.handleMu
+//               dentry.dataMu
+//
+// Locking dentry.dirMu in multiple dentries requires holding
+// filesystem.renameMu for writing.
+package gofer
+
+import (
+	"fmt"
+	"strconv"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// mfp is used to allocate memory that caches regular file contents. mfp is
+	// immutable.
+	mfp pgalloc.MemoryFileProvider
+
+	// Immutable options.
+	opts filesystemOptions
+
+	// client is the client used by this filesystem. client is immutable.
+	client *p9.Client
+
+	// uid and gid are the effective KUID and KGID of the filesystem's creator,
+	// and are used as the owner and group for files that don't specify one.
+	// uid and gid are immutable.
+	uid auth.KUID
+	gid auth.KGID
+
+	// renameMu serves two purposes:
+	//
+	// - It synchronizes path resolution with renaming initiated by this
+	// client.
+	//
+	// - It is held by path resolution to ensure that reachable dentries remain
+	// valid. A dentry is reachable by path resolution if it has a non-zero
+	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
+	// is reachable from its children), or if it is a child dentry (such that
+	// it is reachable from its parent).
+	renameMu sync.RWMutex
+
+	// cachedDentries contains all dentries with 0 references. (Due to race
+	// conditions, it may also contain dentries with non-zero references.)
+	// cachedDentriesLen is the number of dentries in cachedDentries. These
+	// fields are protected by renameMu.
+	cachedDentries    dentryList
+	cachedDentriesLen uint64
+
+	// dentries contains all dentries in this filesystem. specialFileFDs
+	// contains all open specialFileFDs. These fields are protected by syncMu.
+	syncMu         sync.Mutex
+	dentries       map[*dentry]struct{}
+	specialFileFDs map[*specialFileFD]struct{}
+}
+
+type filesystemOptions struct {
+	// "Standard" 9P options.
+	fd      int
+	aname   string
+	interop InteropMode // derived from the "cache" mount option
+	msize   uint32
+	version string
+
+	// maxCachedDentries is the maximum number of dentries with 0 references
+	// retained by the client.
+	maxCachedDentries uint64
+
+	// If forcePageCache is true, host FDs may not be used for application
+	// memory mappings even if available; instead, the client must perform its
+	// own caching of regular file pages. This is primarily useful for testing.
+	forcePageCache bool
+
+	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
+	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
+	// makes memory accounting behavior more consistent between cases where
+	// host FDs are / are not available, but may increase the frequency of
+	// sentry-handled page faults on files for which a host FD is available.
+	limitHostFDTranslation bool
+
+	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
+	// filesystem may not be coherent with writable host FDs opened later, so
+	// mappings of the former must be replaced by mappings of the latter. This
+	// is usually only the case when the remote filesystem is an overlayfs
+	// mount on Linux < 4.19.
+	overlayfsStaleRead bool
+
+	// If regularFilesUseSpecialFileFD is true, application FDs representing
+	// regular files will use distinct file handles for each FD, in the same
+	// way that application FDs representing "special files" such as sockets
+	// do. Note that this disables client caching and mmap for regular files.
+	regularFilesUseSpecialFileFD bool
+}
+
+// InteropMode controls the client's interaction with other remote filesystem
+// users.
+type InteropMode uint32
+
+const (
+	// InteropModeExclusive is appropriate when the filesystem client is the
+	// only user of the remote filesystem.
+	//
+	// - The client may cache arbitrary filesystem state (file data, metadata,
+	// filesystem structure, etc.).
+	//
+	// - Client changes to filesystem state may be sent to the remote
+	// filesystem asynchronously, except when server permission checks are
+	// necessary.
+	//
+	// - File timestamps are based on client clocks. This ensures that users of
+	// the client observe timestamps that are coherent with their own clocks
+	// and consistent with Linux's semantics. However, since it is not always
+	// possible for clients to set arbitrary atimes and mtimes, and never
+	// possible for clients to set arbitrary ctimes, file timestamp changes are
+	// stored in the client only and never sent to the remote filesystem.
+	InteropModeExclusive InteropMode = iota
+
+	// InteropModeWritethrough is appropriate when there are read-only users of
+	// the remote filesystem that expect to observe changes made by the
+	// filesystem client.
+	//
+	// - The client may cache arbitrary filesystem state.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on client clocks. As a corollary, access
+	// timestamp changes from other remote filesystem users will not be visible
+	// to the client.
+	InteropModeWritethrough
+
+	// InteropModeShared is appropriate when there are users of the remote
+	// filesystem that may mutate its state other than the client.
+	//
+	// - The client must verify cached filesystem state before using it.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on server clocks. This is necessary to
+	// ensure that timestamp changes are synchronized between remote filesystem
+	// users.
+	//
+	// Note that the correctness of InteropModeShared depends on the server
+	// correctly implementing 9P fids (i.e. each fid immutably represents a
+	// single filesystem object), even in the presence of remote filesystem
+	// mutations from other users. If this is violated, the behavior of the
+	// client is undefined.
+	InteropModeShared
+)
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+	if mfp == nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
+		return nil, nil, syserror.EINVAL
+	}
+
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	var fsopts filesystemOptions
+
+	// Check that the transport is "fd".
+	trans, ok := mopts["trans"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "trans")
+	if trans != "fd" {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Check that read and write FDs are provided and identical.
+	rfdstr, ok := mopts["rfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "rfdno")
+	rfd, err := strconv.Atoi(rfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	wfdstr, ok := mopts["wfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "wfdno")
+	wfd, err := strconv.Atoi(wfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	if rfd != wfd {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+		return nil, nil, syserror.EINVAL
+	}
+	fsopts.fd = rfd
+
+	// Get the attach name.
+	fsopts.aname = "/"
+	if aname, ok := mopts["aname"]; ok {
+		delete(mopts, "aname")
+		fsopts.aname = aname
+	}
+
+	// Parse the cache policy. For historical reasons, this defaults to the
+	// least generally-applicable option, InteropModeExclusive.
+	fsopts.interop = InteropModeExclusive
+	if cache, ok := mopts["cache"]; ok {
+		delete(mopts, "cache")
+		switch cache {
+		case "fscache":
+			fsopts.interop = InteropModeExclusive
+		case "fscache_writethrough":
+			fsopts.interop = InteropModeWritethrough
+		case "none":
+			fsopts.regularFilesUseSpecialFileFD = true
+			fallthrough
+		case "remote_revalidating":
+			fsopts.interop = InteropModeShared
+		default:
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
+	// Parse the 9P message size.
+	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
+	if msizestr, ok := mopts["msize"]; ok {
+		delete(mopts, "msize")
+		msize, err := strconv.ParseUint(msizestr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.msize = uint32(msize)
+	}
+
+	// Parse the 9P protocol version.
+	fsopts.version = p9.HighestVersionString()
+	if version, ok := mopts["version"]; ok {
+		delete(mopts, "version")
+		fsopts.version = version
+	}
+
+	// Parse the dentry cache limit.
+	fsopts.maxCachedDentries = 1000
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err := strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.maxCachedDentries = maxCachedDentries
+	}
+
+	// Handle simple flags.
+	if _, ok := mopts["force_page_cache"]; ok {
+		delete(mopts, "force_page_cache")
+		fsopts.forcePageCache = true
+	}
+	if _, ok := mopts["limit_host_fd_translation"]; ok {
+		delete(mopts, "limit_host_fd_translation")
+		fsopts.limitHostFDTranslation = true
+	}
+	if _, ok := mopts["overlayfs_stale_read"]; ok {
+		delete(mopts, "overlayfs_stale_read")
+		fsopts.overlayfsStaleRead = true
+	}
+	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
+	// "cache=none".
+
+	// Check for unparsed options.
+	if len(mopts) != 0 {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Establish a connection with the server.
+	conn, err := unet.NewSocket(fsopts.fd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Perform version negotiation with the server.
+	ctx.UninterruptibleSleepStart(false)
+	client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		conn.Close()
+		return nil, nil, err
+	}
+	// Ownership of conn has been transferred to client.
+
+	// Perform attach to obtain the filesystem root.
+	ctx.UninterruptibleSleepStart(false)
+	attached, err := client.Attach(fsopts.aname)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		client.Close()
+		return nil, nil, err
+	}
+	attachFile := p9file{attached}
+	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+	if err != nil {
+		attachFile.close(ctx)
+		client.Close()
+		return nil, nil, err
+	}
+
+	// Construct the filesystem object.
+	fs := &filesystem{
+		mfp:            mfp,
+		opts:           fsopts,
+		uid:            creds.EffectiveKUID,
+		gid:            creds.EffectiveKGID,
+		client:         client,
+		dentries:       make(map[*dentry]struct{}),
+		specialFileFDs: make(map[*specialFileFD]struct{}),
+	}
+	fs.vfsfs.Init(vfsObj, fs)
+
+	// Construct the root dentry.
+	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
+	if err != nil {
+		attachFile.close(ctx)
+		fs.vfsfs.DecRef()
+		return nil, nil, err
+	}
+	// Set the root's reference count to 2. One reference is returned to the
+	// caller, and the other is deliberately leaked to prevent the root from
+	// being "cached" and subsequently evicted. Its resources will still be
+	// cleaned up by fs.Release().
+	root.refs = 2
+
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	ctx := context.Background()
+	mf := fs.mfp.MemoryFile()
+
+	fs.syncMu.Lock()
+	for d := range fs.dentries {
+		d.handleMu.Lock()
+		d.dataMu.Lock()
+		if d.handleWritable {
+			// Write dirty cached data to the remote file.
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil {
+				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
+			}
+			// TODO(jamieliu): Do we need to flushf/fsync d?
+		}
+		// Discard cached pages.
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+		d.dataMu.Unlock()
+		// Close the host fd if one exists.
+		if d.handle.fd >= 0 {
+			syscall.Close(int(d.handle.fd))
+			d.handle.fd = -1
+		}
+		d.handleMu.Unlock()
+	}
+	// There can't be any specialFileFDs still using fs, since each such
+	// FileDescription would hold a reference on a Mount holding a reference on
+	// fs.
+	fs.syncMu.Unlock()
+
+	// Close the connection to the server. This implicitly clunks all fids.
+	fs.client.Close()
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// refs is the reference count. Each dentry holds a reference on its
+	// parent, even if disowned. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// We don't support hard links, so each dentry maps 1:1 to an inode.
+
+	// file is the unopened p9.File that backs this dentry. file is immutable.
+	file p9file
+
+	// If deleted is non-zero, the file represented by this dentry has been
+	// deleted. deleted is accessed using atomic memory operations.
+	deleted uint32
+
+	// If cached is true, dentryEntry links dentry into
+	// filesystem.cachedDentries. cached and dentryEntry are protected by
+	// filesystem.renameMu.
+	cached bool
+	dentryEntry
+
+	dirMu sync.Mutex
+
+	// If this dentry represents a directory, and InteropModeShared is not in
+	// effect, negativeChildren is a set of child names in this directory that
+	// are known not to exist. negativeChildren is protected by dirMu.
+	negativeChildren map[string]struct{}
+
+	// If this dentry represents a directory, InteropModeShared is not in
+	// effect, and dirents is not nil, it is a cache of all entries in the
+	// directory, in the order they were returned by the server. dirents is
+	// protected by dirMu.
+	dirents []vfs.Dirent
+
+	// Cached metadata; protected by metadataMu and accessed using atomic
+	// memory operations unless otherwise specified.
+	metadataMu sync.Mutex
+	ino        uint64 // immutable
+	mode       uint32 // type is immutable, perms are mutable
+	uid        uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32 // auth.KGID, but ...
+	blockSize  uint32 // 0 if unknown
+	// Timestamps, all nsecs from the Unix epoch.
+	atime int64
+	mtime int64
+	ctime int64
+	btime int64
+	// File size, protected by both metadataMu and dataMu (i.e. both must be
+	// locked to mutate it).
+	size uint64
+
+	mapsMu sync.Mutex
+
+	// If this dentry represents a regular file, mappings tracks mappings of
+	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// If this dentry represents a regular file or directory:
+	//
+	// - handle is the I/O handle used by all regularFileFDs/directoryFDs
+	// representing this dentry.
+	//
+	// - handleReadable is true if handle is readable.
+	//
+	// - handleWritable is true if handle is writable.
+	//
+	// Invariants:
+	//
+	// - If handleReadable == handleWritable == false, then handle.file == nil
+	// (i.e. there is no open handle). Conversely, if handleReadable ||
+	// handleWritable == true, then handle.file != nil (i.e. there is an open
+	// handle).
+	//
+	// - handleReadable and handleWritable cannot transition from true to false
+	// (i.e. handles may not be downgraded).
+	//
+	// These fields are protected by handleMu.
+	handleMu       sync.RWMutex
+	handle         handle
+	handleReadable bool
+	handleWritable bool
+
+	dataMu sync.RWMutex
+
+	// If this dentry represents a regular file that is client-cached, cache
+	// maps offsets into the cached file to offsets into
+	// filesystem.mfp.MemoryFile() that store the file's data. cache is
+	// protected by dataMu.
+	cache fsutil.FileRangeSet
+
+	// If this dentry represents a regular file that is client-cached, dirty
+	// tracks dirty segments in cache. dirty is protected by dataMu.
+	dirty fsutil.DirtySet
+
+	// pf implements platform.File for mappings of handle.fd.
+	pf dentryPlatformFile
+
+	// If this dentry represents a symbolic link, InteropModeShared is not in
+	// effect, and haveTarget is true, target is the symlink target. haveTarget
+	// and target are protected by dataMu.
+	haveTarget bool
+	target     string
+}
+
+// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
+// gofer client.
+func dentryAttrMask() p9.AttrMask {
+	return p9.AttrMask{
+		Mode:  true,
+		UID:   true,
+		GID:   true,
+		ATime: true,
+		MTime: true,
+		CTime: true,
+		Size:  true,
+		BTime: true,
+	}
+}
+
+// newDentry creates a new dentry representing the given file. The dentry
+// initially has no references, but is not cached; it is the caller's
+// responsibility to set the dentry's reference count and/or call
+// dentry.checkCachingLocked() as appropriate.
+func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
+	if !mask.Mode {
+		ctx.Warningf("can't create gofer.dentry without file type")
+		return nil, syserror.EIO
+	}
+	if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
+		ctx.Warningf("can't create regular file gofer.dentry without file size")
+		return nil, syserror.EIO
+	}
+
+	d := &dentry{
+		fs:        fs,
+		file:      file,
+		ino:       qid.Path,
+		mode:      uint32(attr.Mode),
+		uid:       uint32(fs.uid),
+		gid:       uint32(fs.gid),
+		blockSize: usermem.PageSize,
+		handle: handle{
+			fd: -1,
+		},
+	}
+	d.pf.dentry = d
+	if mask.UID {
+		d.uid = uint32(attr.UID)
+	}
+	if mask.GID {
+		d.gid = uint32(attr.GID)
+	}
+	if mask.Size {
+		d.size = attr.Size
+	}
+	if attr.BlockSize != 0 {
+		d.blockSize = uint32(attr.BlockSize)
+	}
+	if mask.ATime {
+		d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
+	}
+	if mask.MTime {
+		d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
+	}
+	if mask.CTime {
+		d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
+	}
+	if mask.BTime {
+		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
+	}
+	d.vfsd.Init(d)
+
+	fs.syncMu.Lock()
+	fs.dentries[d] = struct{}{}
+	fs.syncMu.Unlock()
+	return d, nil
+}
+
+// updateFromP9Attrs is called to update d's metadata after an update from the
+// remote filesystem.
+func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
+	d.metadataMu.Lock()
+	if mask.Mode {
+		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
+			d.metadataMu.Unlock()
+			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
+		}
+		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
+	}
+	if mask.UID {
+		atomic.StoreUint32(&d.uid, uint32(attr.UID))
+	}
+	if mask.GID {
+		atomic.StoreUint32(&d.gid, uint32(attr.GID))
+	}
+	// There is no P9_GETATTR_* bit for I/O block size.
+	if attr.BlockSize != 0 {
+		atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
+	}
+	if mask.ATime {
+		atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
+	}
+	if mask.MTime {
+		atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
+	}
+	if mask.CTime {
+		atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds))
+	}
+	if mask.BTime {
+		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
+	}
+	if mask.Size {
+		d.dataMu.Lock()
+		atomic.StoreUint64(&d.size, attr.Size)
+		d.dataMu.Unlock()
+	}
+	d.metadataMu.Unlock()
+}
+
+func (d *dentry) updateFromGetattr(ctx context.Context) error {
+	// Use d.handle.file, which represents a 9P fid that has been opened, in
+	// preference to d.file, which represents a 9P fid that has not. This may
+	// be significantly more efficient in some implementations.
+	var (
+		file            p9file
+		handleMuRLocked bool
+	)
+	d.handleMu.RLock()
+	if !d.handle.file.isNil() {
+		file = d.handle.file
+		handleMuRLocked = true
+	} else {
+		file = d.file
+		d.handleMu.RUnlock()
+	}
+	_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
+	if handleMuRLocked {
+		d.handleMu.RUnlock()
+	}
+	if err != nil {
+		return err
+	}
+	d.updateFromP9Attrs(attrMask, &attr)
+	return nil
+}
+
+func (d *dentry) fileType() uint32 {
+	return atomic.LoadUint32(&d.mode) & linux.S_IFMT
+}
+
+func (d *dentry) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
+	stat.Blksize = atomic.LoadUint32(&d.blockSize)
+	stat.Nlink = 1
+	if d.isDir() {
+		stat.Nlink = 2
+	}
+	stat.UID = atomic.LoadUint32(&d.uid)
+	stat.GID = atomic.LoadUint32(&d.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+	stat.Ino = d.ino
+	stat.Size = atomic.LoadUint64(&d.size)
+	// This is consistent with regularFileFD.Seek(), which treats regular files
+	// as having no holes.
+	stat.Blocks = (stat.Size + 511) / 512
+	stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
+	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
+	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
+	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+	// TODO(jamieliu): device number
+}
+
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
+	if stat.Mask == 0 {
+		return nil
+	}
+	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	setLocalAtime := false
+	setLocalMtime := false
+	if d.fs.opts.interop != InteropModeShared {
+		// Timestamp updates will be handled locally.
+		setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
+		setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
+		stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME
+		if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) {
+			// Truncate updates mtime.
+			setLocalMtime = true
+			stat.Mtime.Nsec = linux.UTIME_NOW
+		}
+	}
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if stat.Mask != 0 {
+		if err := d.file.setAttr(ctx, p9.SetAttrMask{
+			Permissions:        stat.Mask&linux.STATX_MODE != 0,
+			UID:                stat.Mask&linux.STATX_UID != 0,
+			GID:                stat.Mask&linux.STATX_GID != 0,
+			Size:               stat.Mask&linux.STATX_SIZE != 0,
+			ATime:              stat.Mask&linux.STATX_ATIME != 0,
+			MTime:              stat.Mask&linux.STATX_MTIME != 0,
+			ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
+			MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
+		}, p9.SetAttr{
+			Permissions:      p9.FileMode(stat.Mode),
+			UID:              p9.UID(stat.UID),
+			GID:              p9.GID(stat.GID),
+			Size:             stat.Size,
+			ATimeSeconds:     uint64(stat.Atime.Sec),
+			ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+			MTimeSeconds:     uint64(stat.Mtime.Sec),
+			MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+		}); err != nil {
+			return err
+		}
+	}
+	if d.fs.opts.interop == InteropModeShared {
+		// There's no point to updating d's metadata in this case since it'll
+		// be overwritten by revalidation before the next time it's used
+		// anyway. (InteropModeShared inhibits client caching of regular file
+		// data, so there's no cache to truncate either.)
+		return nil
+	}
+	now, haveNow := nowFromContext(ctx)
+	if !haveNow {
+		ctx.Warningf("gofer.dentry.setStat: current time not available")
+	}
+	if stat.Mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
+	}
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&d.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&d.gid, stat.GID)
+	}
+	if setLocalAtime {
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			if haveNow {
+				atomic.StoreInt64(&d.atime, now)
+			}
+		} else {
+			atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+		}
+	}
+	if setLocalMtime {
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			if haveNow {
+				atomic.StoreInt64(&d.mtime, now)
+			}
+		} else {
+			atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+		}
+	}
+	if haveNow {
+		atomic.StoreInt64(&d.ctime, now)
+	}
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		d.dataMu.Lock()
+		oldSize := d.size
+		d.size = stat.Size
+		// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+		// below. This allows concurrent calls to Read/Translate/etc. These
+		// functions synchronize with truncation by refusing to use cache
+		// contents beyond the new d.size. (We are still holding d.metadataMu,
+		// so we can't race with Write or another truncate.)
+		d.dataMu.Unlock()
+		if d.size < oldSize {
+			oldpgend := pageRoundUp(oldSize)
+			newpgend := pageRoundUp(d.size)
+			if oldpgend != newpgend {
+				d.mapsMu.Lock()
+				d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+					// Compare Linux's mm/truncate.c:truncate_setsize() =>
+					// truncate_pagecache() =>
+					// mm/memory.c:unmap_mapping_range(evencows=1).
+					InvalidatePrivate: true,
+				})
+				d.mapsMu.Unlock()
+			}
+			// We are now guaranteed that there are no translations of
+			// truncated pages, and can remove them from the cache. Since
+			// truncated pages have been removed from the remote file, they
+			// should be dropped without being written back.
+			d.dataMu.Lock()
+			d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+			d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
+			d.dataMu.Unlock()
+		}
+	}
+	return nil
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+	// d.checkCachingLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef() {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.renameMu.Lock()
+		d.checkCachingLocked()
+		d.fs.renameMu.Unlock()
+	} else if refs < 0 {
+		panic("gofer.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// checkCachingLocked should be called after d's reference count becomes 0 or it
+// becomes disowned.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkCachingLocked() {
+	// Dentries with a non-zero reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires renameMu, so if d.refs is zero then it will
+	// remain zero while we hold renameMu for writing.)
+	if atomic.LoadInt64(&d.refs) != 0 {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		return
+	}
+	// Non-child dentries with zero references are no longer reachable by path
+	// resolution and should be dropped immediately.
+	if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		d.destroyLocked()
+		return
+	}
+	// If d is already cached, just move it to the front of the LRU.
+	if d.cached {
+		d.fs.cachedDentries.Remove(d)
+		d.fs.cachedDentries.PushFront(d)
+		return
+	}
+	// Cache the dentry, then evict the least recently used cached dentry if
+	// the cache becomes over-full.
+	d.fs.cachedDentries.PushFront(d)
+	d.fs.cachedDentriesLen++
+	d.cached = true
+	if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
+		victim := d.fs.cachedDentries.Back()
+		d.fs.cachedDentries.Remove(victim)
+		d.fs.cachedDentriesLen--
+		victim.cached = false
+		// victim.refs may have become non-zero from an earlier path
+		// resolution since it was inserted into fs.cachedDentries; see
+		// dentry.incRefLocked(). Either way, we brought
+		// fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so
+		// we don't loop.
+		if atomic.LoadInt64(&victim.refs) == 0 {
+			if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil {
+				victimParent := victimParentVFSD.Impl().(*dentry)
+				victimParent.dirMu.Lock()
+				if !victim.vfsd.IsDisowned() {
+					// victim can't be a mount point (in any mount
+					// namespace), since VFS holds references on mount
+					// points.
+					d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd)
+					// We're only deleting the dentry, not the file it
+					// represents, so we don't need to update
+					// victimParent.dirents etc.
+				}
+				victimParent.dirMu.Unlock()
+			}
+			victim.destroyLocked()
+		}
+	}
+}
+
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
+// not a child dentry.
+func (d *dentry) destroyLocked() {
+	ctx := context.Background()
+	d.handleMu.Lock()
+	if !d.handle.file.isNil() {
+		mf := d.fs.mfp.MemoryFile()
+		d.dataMu.Lock()
+		// Write dirty pages back to the remote filesystem.
+		if d.handleWritable {
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+				log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err)
+			}
+		}
+		// Discard cached data.
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+		d.dataMu.Unlock()
+		// Clunk open fids and close open host FDs.
+		d.handle.close(ctx)
+	}
+	d.handleMu.Unlock()
+	d.file.close(ctx)
+	// Remove d from the set of all dentries.
+	d.fs.syncMu.Lock()
+	delete(d.fs.dentries, d)
+	d.fs.syncMu.Unlock()
+	// Drop the reference held by d on its parent.
+	if parentVFSD := d.vfsd.Parent(); parentVFSD != nil {
+		parent := parentVFSD.Impl().(*dentry)
+		// This is parent.DecRef() without recursive locking of d.fs.renameMu.
+		if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 {
+			parent.checkCachingLocked()
+		} else if refs < 0 {
+			panic("gofer.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+func (d *dentry) isDeleted() bool {
+	return atomic.LoadUint32(&d.deleted) != 0
+}
+
+func (d *dentry) setDeleted() {
+	atomic.StoreUint32(&d.deleted, 1)
+}
+
+func (d *dentry) listxattr(ctx context.Context) ([]string, error) {
+	return nil, syserror.ENOTSUP
+}
+
+func (d *dentry) getxattr(ctx context.Context, name string) (string, error) {
+	// TODO(jamieliu): add vfs.GetxattrOptions.Size
+	return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX)
+}
+
+func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error {
+	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
+}
+
+func (d *dentry) removexattr(ctx context.Context, name string) error {
+	return syserror.ENOTSUP
+}
+
+// Preconditions: d.isRegularFile() || d.isDirectory().
+func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
+	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
+	// O_TRUNC).
+	if !trunc {
+		d.handleMu.RLock()
+		if (!read || d.handleReadable) && (!write || d.handleWritable) {
+			// The current handle is sufficient.
+			d.handleMu.RUnlock()
+			return nil
+		}
+		d.handleMu.RUnlock()
+	}
+
+	haveOldFD := false
+	d.handleMu.Lock()
+	if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc {
+		// Get a new handle.
+		wantReadable := d.handleReadable || read
+		wantWritable := d.handleWritable || write
+		h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc)
+		if err != nil {
+			d.handleMu.Unlock()
+			return err
+		}
+		if !d.handle.file.isNil() {
+			// Check that old and new handles are compatible: If the old handle
+			// includes a host file descriptor but the new one does not, or
+			// vice versa, old and new memory mappings may be incoherent.
+			haveOldFD = d.handle.fd >= 0
+			haveNewFD := h.fd >= 0
+			if haveOldFD != haveNewFD {
+				d.handleMu.Unlock()
+				ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD)
+				h.close(ctx)
+				return syserror.EIO
+			}
+			if haveOldFD {
+				// We may have raced with callers of d.pf.FD() that are now
+				// using the old file descriptor, preventing us from safely
+				// closing it. We could handle this by invalidating existing
+				// memmap.Translations, but this is expensive. Instead, use
+				// dup2() to make the old file descriptor refer to the new file
+				// description, then close the new file descriptor (which is no
+				// longer needed). Racing callers may use the old or new file
+				// description, but this doesn't matter since they refer to the
+				// same file (unless d.fs.opts.overlayfsStaleRead is true,
+				// which we handle separately).
+				if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil {
+					d.handleMu.Unlock()
+					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
+					h.close(ctx)
+					return err
+				}
+				syscall.Close(int(h.fd))
+				h.fd = d.handle.fd
+				if d.fs.opts.overlayfsStaleRead {
+					// Replace sentry mappings of the old FD with mappings of
+					// the new FD, since the two are not necessarily coherent.
+					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
+						d.handleMu.Unlock()
+						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
+						h.close(ctx)
+						return err
+					}
+				}
+				// Clunk the old fid before making the new handle visible (by
+				// unlocking d.handleMu).
+				d.handle.file.close(ctx)
+			}
+		}
+		// Switch to the new handle.
+		d.handle = h
+		d.handleReadable = wantReadable
+		d.handleWritable = wantWritable
+	}
+	d.handleMu.Unlock()
+
+	if d.fs.opts.overlayfsStaleRead && haveOldFD {
+		// Invalidate application mappings that may be using the old FD; they
+		// will be replaced with mappings using the new FD after future calls
+		// to d.Translate(). This requires holding d.mapsMu, which precedes
+		// d.handleMu in the lock order.
+		d.mapsMu.Lock()
+		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+		d.mapsMu.Unlock()
+	}
+
+	return nil
+}
+
+// fileDescription is embedded by gofer implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	d := fd.dentry()
+	if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
+		// available?
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	var stat linux.Statx
+	d.statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount())
+}
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) {
+	return fd.dentry().listxattr(ctx)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+	return fd.dentry().getxattr(ctx, name)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	return fd.dentry().setxattr(ctx, &opts)
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	return fd.dentry().removexattr(ctx, name)
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
new file mode 100644
index 000000000..cfe66f797
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -0,0 +1,135 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// handle represents a remote "open file descriptor", consisting of an opened
+// fid (p9.File) and optionally a host file descriptor.
+type handle struct {
+	file p9file
+	fd   int32 // -1 if unavailable
+}
+
+// Preconditions: read || write.
+func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) {
+	_, newfile, err := file.walk(ctx, nil)
+	if err != nil {
+		return handle{fd: -1}, err
+	}
+	var flags p9.OpenFlags
+	switch {
+	case read && !write:
+		flags = p9.ReadOnly
+	case !read && write:
+		flags = p9.WriteOnly
+	case read && write:
+		flags = p9.ReadWrite
+	}
+	if trunc {
+		flags |= p9.OpenTruncate
+	}
+	fdobj, _, _, err := newfile.open(ctx, flags)
+	if err != nil {
+		newfile.close(ctx)
+		return handle{fd: -1}, err
+	}
+	fd := int32(-1)
+	if fdobj != nil {
+		fd = int32(fdobj.Release())
+	}
+	return handle{
+		file: newfile,
+		fd:   fd,
+	}, nil
+}
+
+func (h *handle) close(ctx context.Context) {
+	h.file.close(ctx)
+	h.file = p9file{}
+	if h.fd >= 0 {
+		syscall.Close(int(h.fd))
+		h.fd = -1
+	}
+}
+
+func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		n, err := hostPreadv(h.fd, dsts, int64(offset))
+		ctx.UninterruptibleSleepFinish(false)
+		return n, err
+	}
+	if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() {
+		n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset)
+		return uint64(n), err
+	}
+	// Buffer the read since p9.File.ReadAt() takes []byte.
+	buf := make([]byte, dsts.NumBytes())
+	n, err := h.file.readAt(ctx, buf, offset)
+	if n == 0 {
+		return 0, err
+	}
+	if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil {
+		return cp, cperr
+	}
+	return uint64(n), err
+}
+
+func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		n, err := hostPwritev(h.fd, srcs, int64(offset))
+		ctx.UninterruptibleSleepFinish(false)
+		return n, err
+	}
+	if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() {
+		n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset)
+		return uint64(n), err
+	}
+	// Buffer the write since p9.File.WriteAt() takes []byte.
+	buf := make([]byte, srcs.NumBytes())
+	cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs)
+	if cp == 0 {
+		return 0, cperr
+	}
+	n, err := h.file.writeAt(ctx, buf[:cp], offset)
+	if err != nil {
+		return uint64(n), err
+	}
+	return cp, cperr
+}
+
+func (h *handle) sync(ctx context.Context) error {
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		err := syscall.Fsync(int(h.fd))
+		ctx.UninterruptibleSleepFinish(false)
+		return err
+	}
+	return h.file.fsync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
new file mode 100644
index 000000000..19560ab26
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// Preconditions: !dsts.IsEmpty().
+func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	if dsts.NumBlocks() == 1 {
+		// Use pread() instead of preadv() to avoid iovec allocation and
+		// copying.
+		dst := dsts.Head()
+		n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0)
+		if e != 0 {
+			return 0, e
+		}
+		return uint64(n), nil
+	}
+	iovs := safemem.IovecsFromBlockSeq(dsts)
+	n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
+	if e != 0 {
+		return 0, e
+	}
+	return uint64(n), nil
+}
+
+// Preconditions: !srcs.IsEmpty().
+func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	if srcs.NumBlocks() == 1 {
+		// Use pwrite() instead of pwritev() to avoid iovec allocation and
+		// copying.
+		src := srcs.Head()
+		n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0)
+		if e != 0 {
+			return 0, e
+		}
+		return uint64(n), nil
+	}
+	iovs := safemem.IovecsFromBlockSeq(srcs)
+	n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
+	if e != 0 {
+		return 0, e
+	}
+	return uint64(n), nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
new file mode 100644
index 000000000..755ac2985
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -0,0 +1,219 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// p9file is a wrapper around p9.File that provides methods that are
+// Context-aware.
+type p9file struct {
+	file p9.File
+}
+
+func (f p9file) isNil() bool {
+	return f.file == nil
+}
+
+func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, err := f.file.Walk(names)
+	ctx.UninterruptibleSleepFinish(false)
+	return qids, p9file{newfile}, err
+}
+
+func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names)
+	ctx.UninterruptibleSleepFinish(false)
+	return qids, p9file{newfile}, attrMask, attr, err
+}
+
+// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single
+// path component and returns a single qid.
+func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name})
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err
+	}
+	if len(qids) != 1 {
+		ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids)
+		if newfile != nil {
+			p9file{newfile}.close(ctx)
+		}
+		return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO
+	}
+	return qids[0], p9file{newfile}, attrMask, attr, nil
+}
+
+func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fsstat, err := f.file.StatFS()
+	ctx.UninterruptibleSleepFinish(false)
+	return fsstat, err
+}
+
+func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, attrMask, attr, err := f.file.GetAttr(req)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, attrMask, attr, err
+}
+
+func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetAttr(valid, attr)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	val, err := f.file.GetXattr(name, size)
+	ctx.UninterruptibleSleepFinish(false)
+	return val, err
+}
+
+func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetXattr(name, value, flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Allocate(mode, offset, length)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) close(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Close()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, qid, iounit, err := f.file.Open(flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, qid, iounit, err
+}
+
+func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	n, err := f.file.ReadAt(p, offset)
+	ctx.UninterruptibleSleepFinish(false)
+	return n, err
+}
+
+func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	n, err := f.file.WriteAt(p, offset)
+	ctx.UninterruptibleSleepFinish(false)
+	return n, err
+}
+
+func (f p9file) fsync(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.FSync()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, p9file{newfile}, qid, iounit, err
+}
+
+func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Mkdir(name, permissions, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Symlink(oldName, newName, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) link(ctx context.Context, target p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Link(target.file, newName)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Mknod(name, mode, major, minor, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Rename(newDir.file, newName)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.UnlinkAt(name, flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) {
+	ctx.UninterruptibleSleepStart(false)
+	dirents, err := f.file.Readdir(offset, count)
+	ctx.UninterruptibleSleepFinish(false)
+	return dirents, err
+}
+
+func (f p9file) readlink(ctx context.Context) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	target, err := f.file.Readlink()
+	ctx.UninterruptibleSleepFinish(false)
+	return target, err
+}
+
+func (f p9file) flush(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Flush()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, err := f.file.Connect(flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go
new file mode 100644
index 000000000..847cb0784
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/pagemath.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// This are equivalent to usermem.Addr.RoundDown/Up, but without the
+// potentially truncating conversion to usermem.Addr. This is necessary because
+// there is no way to define generic "PageRoundDown/Up" functions in Go.
+
+func pageRoundDown(x uint64) uint64 {
+	return x &^ (usermem.PageSize - 1)
+}
+
+func pageRoundUp(x uint64) uint64 {
+	return pageRoundDown(x + usermem.PageSize - 1)
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
new file mode 100644
index 000000000..8e11e06b3
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -0,0 +1,860 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isRegularFile() bool {
+	return d.fileType() == linux.S_IFREG
+}
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset. off is protected by mu.
+	mu  sync.Mutex
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	// Skip flushing if writes may be buffered by the client, since (as with
+	// the VFS1 client) we don't flush buffered writes on close anyway.
+	d := fd.dentry()
+	if d.fs.opts.interop == InteropModeExclusive {
+		return nil
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return d.handle.file.flush(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Check for reading at EOF before calling into MM (but not under
+	// InteropModeShared, which makes d.size unreliable).
+	d := fd.dentry()
+	if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) {
+		return 0, io.EOF
+	}
+
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Lock d.metadataMu for the rest of the read to prevent d.size from
+		// changing.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+		// Write dirty cached pages that will be touched by the read back to
+		// the remote file.
+		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
+			return 0, err
+		}
+	}
+
+	rw := getDentryReadWriter(ctx, d, offset)
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Require the read to go to the remote file.
+		rw.direct = true
+	}
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putDentryReadWriter(rw)
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+		d.touchAtime(ctx, fd.vfsfd.Mount())
+	}
+	return n, err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
+		// file_update_time(). This is d.touchCMtime(), but without locking
+		// d.metadataMu (recursively).
+		if now, ok := nowFromContext(ctx); ok {
+			atomic.StoreInt64(&d.mtime, now)
+			atomic.StoreInt64(&d.ctime, now)
+		}
+	}
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Write dirty cached pages that will be touched by the write back to
+		// the remote file.
+		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+			return 0, err
+		}
+		// Remove touched pages from the cache.
+		pgstart := pageRoundDown(uint64(offset))
+		pgend := pageRoundUp(uint64(offset + src.NumBytes()))
+		if pgend < pgstart {
+			return 0, syserror.EINVAL
+		}
+		mr := memmap.MappableRange{pgstart, pgend}
+		var freed []platform.FileRange
+		d.dataMu.Lock()
+		cseg := d.cache.LowerBoundSegment(mr.Start)
+		for cseg.Ok() && cseg.Start() < mr.End {
+			cseg = d.cache.Isolate(cseg, mr)
+			freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+			cseg = d.cache.Remove(cseg).NextSegment()
+		}
+		d.dataMu.Unlock()
+		// Invalidate mappings of removed pages.
+		d.mapsMu.Lock()
+		d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
+		d.mapsMu.Unlock()
+		// Finally free pages removed from the cache.
+		mf := d.fs.mfp.MemoryFile()
+		for _, freedFR := range freed {
+			mf.DecRef(freedFR)
+		}
+	}
+	rw := getDentryReadWriter(ctx, d, offset)
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Require the write to go to the remote file.
+		rw.direct = true
+	}
+	n, err := src.CopyInTo(ctx, rw)
+	putDentryReadWriter(rw)
+	if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+		// Write dirty cached pages touched by the write back to the remote
+		// file.
+		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+			return 0, err
+		}
+		// Request the remote filesystem to sync the remote file.
+		if err := d.handle.file.fsync(ctx); err != nil {
+			return 0, err
+		}
+	}
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+type dentryReadWriter struct {
+	ctx    context.Context
+	d      *dentry
+	off    uint64
+	direct bool
+}
+
+var dentryReadWriterPool = sync.Pool{
+	New: func() interface{} {
+		return &dentryReadWriter{}
+	},
+}
+
+func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
+	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
+	rw.ctx = ctx
+	rw.d = d
+	rw.off = uint64(offset)
+	rw.direct = false
+	return rw
+}
+
+func putDentryReadWriter(rw *dentryReadWriter) {
+	rw.ctx = nil
+	rw.d = nil
+	dentryReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents and makes dentry.size
+	// unreliable), or if the file was opened O_DIRECT, read directly from
+	// dentry.handle without locking dentry.dataMu.
+	rw.d.handleMu.RLock()
+	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off)
+		rw.d.handleMu.RUnlock()
+		rw.off += n
+		return n, err
+	}
+
+	// Otherwise read from/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	fillCache := mf.ShouldCacheEvictable()
+	var dataMuUnlock func()
+	if fillCache {
+		rw.d.dataMu.Lock()
+		dataMuUnlock = rw.d.dataMu.Unlock
+	} else {
+		rw.d.dataMu.RLock()
+		dataMuUnlock = rw.d.dataMu.RUnlock
+	}
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= rw.d.size {
+		dataMuUnlock()
+		rw.d.handleMu.RUnlock()
+		return 0, io.EOF
+	}
+	end := rw.d.size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += n
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			gapMR := gap.Range().Intersect(mr)
+			if fillCache {
+				// Read into the cache, then re-enter the loop to read from the
+				// cache.
+				reqMR := memmap.MappableRange{
+					Start: pageRoundDown(gapMR.Start),
+					End:   pageRoundUp(gapMR.End),
+				}
+				optMR := gap.Range()
+				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
+				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
+				seg, gap = rw.d.cache.Find(rw.off)
+				if !seg.Ok() {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+				// err might have occurred in part of gap.Range() outside
+				// gapMR. Forget about it for now; if the error matters and
+				// persists, we'll run into it again in a later iteration of
+				// this loop.
+			} else {
+				// Read directly from the file.
+				gapDsts := dsts.TakeFirst64(gapMR.Length())
+				n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+				done += n
+				rw.off += n
+				dsts = dsts.DropFirst64(n)
+				// Partial reads are fine. But we must stop reading.
+				if n != gapDsts.NumBytes() || err != nil {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+
+				// Continue.
+				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+			}
+		}
+	}
+	dataMuUnlock()
+	rw.d.handleMu.RUnlock()
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: rw.d.metadataMu must be locked.
+func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents), or if the file was
+	// opened with O_DIRECT, write directly to dentry.handle without locking
+	// dentry.dataMu.
+	rw.d.handleMu.RLock()
+	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+		rw.d.handleMu.RUnlock()
+		rw.off += n
+		return n, err
+	}
+
+	// Otherwise write to/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	rw.d.dataMu.Lock()
+
+	// Compute the range to write (overflow-checked).
+	start := rw.off
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			segMR := seg.Range().Intersect(mr)
+			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			rw.d.dirty.MarkDirty(segMR)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Write directly to the file. At present, we never fill the cache
+			// when writing, since doing so can convert small writes into
+			// inefficient read-modify-write cycles, and we have no mechanism
+			// for detecting or avoiding this.
+			gapMR := gap.Range().Intersect(mr)
+			gapSrcs := srcs.TakeFirst64(gapMR.Length())
+			n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			// Partial writes are fine. But we must stop writing.
+			if n != gapSrcs.NumBytes() || err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop:
+	if rw.off > rw.d.size {
+		atomic.StoreUint64(&rw.d.size, rw.off)
+		// The remote file's size will implicitly be extended to the correct
+		// value when we write back to it.
+	}
+	// If InteropModeWritethrough is in effect, flush written data back to the
+	// remote filesystem.
+	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
+		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
+			Start: start,
+			End:   rw.off,
+		}, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil {
+			// We have no idea how many bytes were actually flushed.
+			rw.off = start
+			done = 0
+			retErr = err
+		}
+	}
+	rw.d.dataMu.Unlock()
+	rw.d.handleMu.RUnlock()
+	return done, retErr
+}
+
+func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
+	if size == 0 {
+		return nil
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	// Compute the range of valid bytes (overflow-checked).
+	if uint64(offset) >= d.size {
+		return nil
+	}
+	end := int64(d.size)
+	if rend := offset + size; rend > offset && rend < end {
+		end = rend
+	}
+	return fsutil.SyncDirty(ctx, memmap.MappableRange{
+		Start: uint64(offset),
+		End:   uint64(end),
+	}, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as specified.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Ensure file size is up to date.
+		d := fd.dentry()
+		if fd.filesystem().opts.interop == InteropModeShared {
+			if err := d.updateFromGetattr(ctx); err != nil {
+				return 0, err
+			}
+		}
+		size := int64(atomic.LoadUint64(&d.size))
+		// For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
+		// block of data.
+		switch whence {
+		case linux.SEEK_END:
+			offset += size
+		case linux.SEEK_DATA:
+			if offset > size {
+				return 0, syserror.ENXIO
+			}
+			// Use offset as specified.
+		case linux.SEEK_HOLE:
+			if offset > size {
+				return 0, syserror.ENXIO
+			}
+			offset = size
+		}
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	return fd.dentry().syncSharedHandle(ctx)
+}
+
+func (d *dentry) syncSharedHandle(ctx context.Context) error {
+	d.handleMu.RLock()
+	if !d.handleWritable {
+		d.handleMu.RUnlock()
+		return nil
+	}
+	d.dataMu.Lock()
+	// Write dirty cached data to the remote file.
+	err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+	d.dataMu.Unlock()
+	if err == nil {
+		// Sync the remote file.
+		err = d.handle.sync(ctx)
+	}
+	d.handleMu.RUnlock()
+	return err
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	d := fd.dentry()
+	switch d.fs.opts.interop {
+	case InteropModeExclusive:
+		// Any mapping is fine.
+	case InteropModeWritethrough:
+		// Shared writable mappings require a host FD, since otherwise we can't
+		// synchronously flush memory-mapped writes to the remote file.
+		if opts.Private || !opts.MaxPerms.Write {
+			break
+		}
+		fallthrough
+	case InteropModeShared:
+		// All mappings require a host FD to be coherent with other filesystem
+		// users.
+		if d.fs.opts.forcePageCache {
+			// Whether or not we have a host FD, we're not allowed to use it.
+			return syserror.ENODEV
+		}
+		d.handleMu.RLock()
+		haveFD := d.handle.fd >= 0
+		d.handleMu.RUnlock()
+		if !haveFD {
+			return syserror.ENODEV
+		}
+	default:
+		panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
+	}
+	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
+}
+
+func (d *dentry) mayCachePages() bool {
+	if d.fs.opts.interop == InteropModeShared {
+		return false
+	}
+	if d.fs.opts.forcePageCache {
+		return true
+	}
+	d.handleMu.RLock()
+	haveFD := d.handle.fd >= 0
+	d.handleMu.RUnlock()
+	return haveFD
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	mapped := d.mappings.AddMapping(ms, ar, offset, writable)
+	// Do this unconditionally since whether we have a host FD can change
+	// across save/restore.
+	for _, r := range mapped {
+		d.pf.hostFileMapper.IncRefOn(r)
+	}
+	if d.mayCachePages() {
+		// d.Evict() will refuse to evict memory-mapped pages, so tell the
+		// MemoryFile to not bother trying.
+		mf := d.fs.mfp.MemoryFile()
+		for _, r := range mapped {
+			mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
+		}
+	}
+	d.mapsMu.Unlock()
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	d.mapsMu.Lock()
+	unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
+	for _, r := range unmapped {
+		d.pf.hostFileMapper.DecRefOn(r)
+	}
+	if d.mayCachePages() {
+		// Pages that are no longer referenced by any application memory
+		// mappings are now considered unused; allow MemoryFile to evict them
+		// when necessary.
+		mf := d.fs.mfp.MemoryFile()
+		d.dataMu.Lock()
+		for _, r := range unmapped {
+			// Since these pages are no longer mapped, they are no longer
+			// concurrently dirtyable by a writable memory mapping.
+			d.dirty.AllowClean(r)
+			mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
+		}
+		d.dataMu.Unlock()
+	}
+	d.mapsMu.Unlock()
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return d.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	d.handleMu.RLock()
+	if d.handle.fd >= 0 && !d.fs.opts.forcePageCache {
+		d.handleMu.RUnlock()
+		mr := optional
+		if d.fs.opts.limitHostFDTranslation {
+			mr = maxFillRange(required, optional)
+		}
+		return []memmap.Translation{
+			{
+				Source: mr,
+				File:   &d.pf,
+				Offset: mr.Start,
+				Perms:  usermem.AnyAccess,
+			},
+		}, nil
+	}
+
+	d.dataMu.Lock()
+
+	// Constrain translations to d.size (rounded up) to prevent translation to
+	// pages that may be concurrently truncated.
+	pgend := pageRoundUp(d.size)
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			d.dataMu.Unlock()
+			d.handleMu.RUnlock()
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	mf := d.fs.mfp.MemoryFile()
+	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt)
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		// TODO(jamieliu): Make Translations writable even if writability is
+		// not required if already kept-dirty by another writable translation.
+		perms := usermem.AccessType{
+			Read:    true,
+			Execute: true,
+		}
+		if at.Write {
+			// From this point forward, this memory can be dirtied through the
+			// mapping at any time.
+			d.dirty.KeepDirty(segMR)
+			perms.Write = true
+		}
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   mf,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  perms,
+		})
+		translatedEnd = segMR.End
+	}
+
+	d.dataMu.Unlock()
+	d.handleMu.RUnlock()
+
+	// Don't return the error returned by c.cache.Fill if it occurred outside
+	// of required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
+	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
+	if required.Length() >= maxReadahead {
+		return required
+	}
+	if optional.Length() <= maxReadahead {
+		return optional
+	}
+	optional.Start = required.Start
+	if optional.Length() <= maxReadahead {
+		return optional
+	}
+	optional.End = optional.Start + maxReadahead
+	return optional
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+	// Whether we have a host fd (and consequently what platform.File is
+	// mapped) can change across save/restore, so invalidate all translations
+	// unconditionally.
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+
+	// Write the cache's contents back to the remote file so that if we have a
+	// host fd after restore, the remote file's contents are coherent.
+	mf := d.fs.mfp.MemoryFile()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+		return err
+	}
+
+	// Discard the cache so that it's not stored in saved state. This is safe
+	// because per InvalidateUnsavable invariants, no new translations can have
+	// been returned after we invalidated all existing translations above.
+	d.cache.DropAll(mf)
+	d.dirty.RemoveAll()
+
+	return nil
+}
+
+// Evict implements pgalloc.EvictableMemoryUser.Evict.
+func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+
+	mr := memmap.MappableRange{er.Start, er.End}
+	mf := d.fs.mfp.MemoryFile()
+	// Only allow pages that are no longer memory-mapped to be evicted.
+	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
+		mgapMR := mgap.Range().Intersect(mr)
+		if mgapMR.Length() == 0 {
+			continue
+		}
+		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
+		}
+		d.cache.Drop(mgapMR, mf)
+		d.dirty.KeepClean(mgapMR)
+	}
+}
+
+// dentryPlatformFile implements platform.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef.
+//
+// dentryPlatformFile is only used when a host FD representing the remote file
+// is available (i.e. dentry.handle.fd >= 0), and that FD is used for
+// application memory mappings (i.e. !filesystem.opts.forcePageCache).
+type dentryPlatformFile struct {
+	*dentry
+
+	// fdRefs counts references on platform.File offsets. fdRefs is protected
+	// by dentry.dataMu.
+	fdRefs fsutil.FrameRefSet
+
+	// If this dentry represents a regular file, and handle.fd >= 0,
+	// hostFileMapper caches mappings of handle.fd.
+	hostFileMapper fsutil.HostFileMapper
+}
+
+// IncRef implements platform.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
+	d.dataMu.Lock()
+	seg, gap := d.fdRefs.Find(fr.Start)
+	for {
+		switch {
+		case seg.Ok() && seg.Start() < fr.End:
+			seg = d.fdRefs.Isolate(seg, fr)
+			seg.SetValue(seg.Value() + 1)
+			seg, gap = seg.NextNonEmpty()
+		case gap.Ok() && gap.Start() < fr.End:
+			newRange := gap.Range().Intersect(fr)
+			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
+			seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
+		default:
+			d.fdRefs.MergeAdjacent(fr)
+			d.dataMu.Unlock()
+			return
+		}
+	}
+}
+
+// DecRef implements platform.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
+	d.dataMu.Lock()
+	seg := d.fdRefs.FindSegment(fr.Start)
+
+	for seg.Ok() && seg.Start() < fr.End {
+		seg = d.fdRefs.Isolate(seg, fr)
+		if old := seg.Value(); old == 1 {
+			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
+			seg = d.fdRefs.Remove(seg).NextSegment()
+		} else {
+			seg.SetValue(old - 1)
+			seg = seg.NextSegment()
+		}
+	}
+	d.fdRefs.MergeAdjacent(fr)
+	d.dataMu.Unlock()
+
+}
+
+// MapInternal implements platform.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	d.handleMu.RLock()
+	bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
+	d.handleMu.RUnlock()
+	return bs, err
+}
+
+// FD implements platform.File.FD.
+func (d *dentryPlatformFile) FD() int {
+	d.handleMu.RLock()
+	fd := d.handle.fd
+	d.handleMu.RUnlock()
+	return int(fd)
+}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
new file mode 100644
index 000000000..08c691c47
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -0,0 +1,159 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// specialFileFD implements vfs.FileDescriptionImpl for files other than
+// regular files, directories, and symlinks: pipes, sockets, etc. It is also
+// used for regular files when filesystemOptions.specialRegularFiles is in
+// effect. specialFileFD differs from regularFileFD by using per-FD handles
+// instead of shared per-dentry handles, and never buffering I/O.
+type specialFileFD struct {
+	fileDescription
+
+	// handle is immutable.
+	handle handle
+
+	// off is the file offset. off is protected by mu. (POSIX 2.9.7 only
+	// requires operations using the file offset to be atomic for regular files
+	// and symlinks; however, since specialFileFD may be used for regular
+	// files, we apply this atomicity unconditionally.)
+	mu  sync.Mutex
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *specialFileFD) Release() {
+	fd.handle.close(context.Background())
+	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+	fs.syncMu.Lock()
+	delete(fs.specialFileFDs, fd)
+	fs.syncMu.Unlock()
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *specialFileFD) OnClose(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	return fd.handle.file.flush(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Going through dst.CopyOutFrom() holds MM locks around file operations of
+	// unknown duration. For regularFileFD, doing so is necessary to support
+	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
+	// hold here since specialFileFD doesn't client-cache data. Just buffer the
+	// read instead.
+	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+		d.touchAtime(ctx, fd.vfsfd.Mount())
+	}
+	buf := make([]byte, dst.NumBytes())
+	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if n == 0 {
+		return 0, err
+	}
+	if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
+		return int64(cp), cperr
+	}
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Do a buffered write. See rationale in PRead.
+	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+		d.touchCMtime(ctx)
+	}
+	buf := make([]byte, src.NumBytes())
+	// Don't do partial writes if we get a partial read from src.
+	if _, err := src.CopyIn(ctx, buf); err != nil {
+		return 0, err
+	}
+	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	return int64(n), err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default:
+		// SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not
+		// clear that file size is even meaningful for these files.
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *specialFileFD) Sync(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	return fd.handle.sync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
new file mode 100644
index 000000000..adf43be60
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func (d *dentry) isSymlink() bool {
+	return d.fileType() == linux.S_IFLNK
+}
+
+// Precondition: d.isSymlink().
+func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchAtime(ctx, mnt)
+		d.dataMu.Lock()
+		if d.haveTarget {
+			target := d.target
+			d.dataMu.Unlock()
+			return target, nil
+		}
+	}
+	target, err := d.file.readlink(ctx)
+	if d.fs.opts.interop != InteropModeShared {
+		if err == nil {
+			d.haveTarget = true
+			d.target = target
+		}
+		d.dataMu.Unlock()
+	}
+	return target, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
new file mode 100644
index 000000000..7598ec6a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -0,0 +1,75 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func dentryTimestampFromP9(s, ns uint64) int64 {
+	return int64(s*1e9 + ns)
+}
+
+func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
+	return ts.Sec*1e9 + int64(ts.Nsec)
+}
+
+func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
+	return linux.StatxTimestamp{
+		Sec:  ns / 1e9,
+		Nsec: uint32(ns % 1e9),
+	}
+}
+
+func nowFromContext(ctx context.Context) (int64, bool) {
+	if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
+		return clock.Now().Nanoseconds(), true
+	}
+	return 0, false
+}
+
+// Preconditions: fs.interop != InteropModeShared.
+func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now, ok := nowFromContext(ctx)
+	if !ok {
+		mnt.EndWrite()
+		return
+	}
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.atime, now)
+	d.metadataMu.Unlock()
+	mnt.EndWrite()
+}
+
+// Preconditions: fs.interop != InteropModeShared. The caller has successfully
+// called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime(ctx context.Context) {
+	now, ok := nowFromContext(ctx)
+	if !ok {
+		return
+	}
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.mtime, now)
+	atomic.StoreInt64(&d.ctime, now)
+	d.metadataMu.Unlock()
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 5ee9cf1e9..72bc15264 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -622,7 +622,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
-	if !rp.MustBeDir() {
+	if rp.MustBeDir() {
 		return syserror.ENOTDIR
 	}
 	mnt := rp.Mount()
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index bde4c7a1e..34f63986f 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -126,7 +126,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 			}
 			return uint64(n), nil
 		}
-		return readv(s.fd, iovecsFromBlockSeq(dsts))
+		return readv(s.fd, safemem.IovecsFromBlockSeq(dsts))
 	}))
 	return int64(n), err
 }
@@ -149,7 +149,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 			}
 			return uint64(n), nil
 		}
-		return writev(s.fd, iovecsFromBlockSeq(srcs))
+		return writev(s.fd, safemem.IovecsFromBlockSeq(srcs))
 	}))
 	return int64(n), err
 }
@@ -402,7 +402,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		// We always do a non-blocking recv*().
 		sysflags := flags | syscall.MSG_DONTWAIT
 
-		iovs := iovecsFromBlockSeq(dsts)
+		iovs := safemem.IovecsFromBlockSeq(dsts)
 		msg := syscall.Msghdr{
 			Iov:    &iovs[0],
 			Iovlen: uint64(len(iovs)),
@@ -522,7 +522,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 			return uint64(n), nil
 		}
 
-		iovs := iovecsFromBlockSeq(srcs)
+		iovs := safemem.IovecsFromBlockSeq(srcs)
 		msg := syscall.Msghdr{
 			Iov:    &iovs[0],
 			Iovlen: uint64(len(iovs)),
@@ -567,21 +567,6 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 	return int(n), syserr.FromError(err)
 }
 
-func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec {
-	iovs := make([]syscall.Iovec, 0, bs.NumBlocks())
-	for ; !bs.IsEmpty(); bs = bs.Tail() {
-		b := bs.Head()
-		iovs = append(iovs, syscall.Iovec{
-			Base: &b.ToSlice()[0],
-			Len:  uint64(b.Len()),
-		})
-		// We don't need to care about b.NeedSafecopy(), because the host
-		// kernel will handle such address ranges just fine (by returning
-		// EFAULT).
-	}
-	return iovs
-}
-
 func translateIOSyscallError(err error) error {
 	if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK {
 		return syserror.ErrWouldBlock
-- 
cgit v1.2.3


From dcffddf0cae026411e7e678744a1e39dc2b513cf Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 4 Feb 2020 11:47:41 -0800
Subject: Remove argument from vfs.MountNamespace.DecRef()

Updates #1035

PiperOrigin-RevId: 293194631
---
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go       | 2 +-
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go  | 2 +-
 pkg/sentry/fsimpl/testutil/testutil.go       | 2 +-
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go    | 4 ++--
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 2 +-
 pkg/sentry/vfs/mount.go                      | 3 ++-
 6 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index d36fa74fb..e03a0c665 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -86,7 +86,7 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth
 // Release must be called when a is no longer in use.
 func (a *Accessor) Release() {
 	a.root.DecRef()
-	a.mntns.DecRef(a.vfsObj)
+	a.mntns.DecRef()
 }
 
 // accessorContext implements context.Context by extending an existing
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 82c58c900..73308a2b5 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -45,7 +45,7 @@ func TestDevtmpfs(t *testing.T) {
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
-	defer mntns.DecRef(vfsObj)
+	defer mntns.DecRef()
 	root := mntns.Root()
 	defer root.DecRef()
 	devpop := vfs.PathOperation{
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 1c98335c1..69fd84ddd 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 // Destroy release resources associated with a test system.
 func (s *System) Destroy() {
 	s.Root.DecRef()
-	s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem.
+	s.mns.DecRef() // Reference on mns passed to NewSystem.
 }
 
 // ReadToEnd reads the contents of fd until EOF to a string.
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 54241c8e8..9fce5e4b4 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -183,7 +183,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
-			defer mntns.DecRef(vfsObj)
+			defer mntns.DecRef()
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
@@ -374,7 +374,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
-			defer mntns.DecRef(vfsObj)
+			defer mntns.DecRef()
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 2b52992ea..e9f71e334 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -51,7 +51,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
 	root := mntns.Root()
 	return vfsObj, root, func() {
 		root.DecRef()
-		mntns.DecRef(vfsObj)
+		mntns.DecRef()
 	}, nil
 }
 
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index d39528051..1fbb420f9 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -423,7 +423,8 @@ func (mntns *MountNamespace) IncRef() {
 }
 
 // DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) {
+func (mntns *MountNamespace) DecRef() {
+	vfs := mntns.root.fs.VirtualFilesystem()
 	if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
 		vfs.mountMu.Lock()
 		vfs.mounts.seq.BeginWrite()
-- 
cgit v1.2.3


From f5072caaf85b9f067d737a874804c04e2b9039b8 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 4 Feb 2020 12:39:58 -0800
Subject: Fix safecopy test.

This is failing in Go1.14 due to new checkptr constraints. This version
should avoid hitting this constraints by doing "dangerous" pointer math
less dangerously?

PiperOrigin-RevId: 293205764
---
 pkg/safecopy/safecopy_test.go   | 88 ++++++++++++++++++++----------------
 pkg/safecopy/safecopy_unsafe.go | 98 ++++++++++++++++++++++++++---------------
 2 files changed, 112 insertions(+), 74 deletions(-)

(limited to 'pkg')

diff --git a/pkg/safecopy/safecopy_test.go b/pkg/safecopy/safecopy_test.go
index 5818f7f9b..7f7f69d61 100644
--- a/pkg/safecopy/safecopy_test.go
+++ b/pkg/safecopy/safecopy_test.go
@@ -138,10 +138,14 @@ func TestSwapUint32Success(t *testing.T) {
 func TestSwapUint32AlignmentError(t *testing.T) {
 	// Test that SwapUint32 returns an AlignmentError when passed an unaligned
 	// address.
-	data := new(struct{ val uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 4}
-	if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want {
+	data := make([]byte, 8) // 2 * sizeof(uint32).
+	alignedIndex := uintptr(0)
+	if offset := uintptr(unsafe.Pointer(&data[0])) % 4; offset != 0 {
+		alignedIndex = 4 - offset
+	}
+	ptr := unsafe.Pointer(&data[alignedIndex+1])
+	want := AlignmentError{Addr: uintptr(ptr), Alignment: 4}
+	if _, err := SwapUint32(ptr, 1); err != want {
 		t.Errorf("Unexpected error: got %v, want %v", err, want)
 	}
 }
@@ -171,10 +175,14 @@ func TestSwapUint64Success(t *testing.T) {
 func TestSwapUint64AlignmentError(t *testing.T) {
 	// Test that SwapUint64 returns an AlignmentError when passed an unaligned
 	// address.
-	data := new(struct{ val1, val2 uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val1)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 8}
-	if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want {
+	data := make([]byte, 16) // 2 * sizeof(uint64).
+	alignedIndex := uintptr(0)
+	if offset := uintptr(unsafe.Pointer(&data[0])) % 8; offset != 0 {
+		alignedIndex = 8 - offset
+	}
+	ptr := unsafe.Pointer(&data[alignedIndex+1])
+	want := AlignmentError{Addr: uintptr(ptr), Alignment: 8}
+	if _, err := SwapUint64(ptr, 1); err != want {
 		t.Errorf("Unexpected error: got %v, want %v", err, want)
 	}
 }
@@ -201,10 +209,14 @@ func TestCompareAndSwapUint32Success(t *testing.T) {
 func TestCompareAndSwapUint32AlignmentError(t *testing.T) {
 	// Test that CompareAndSwapUint32 returns an AlignmentError when passed an
 	// unaligned address.
-	data := new(struct{ val uint64 })
-	addr := uintptr(unsafe.Pointer(&data.val)) + 1
-	want := AlignmentError{Addr: addr, Alignment: 4}
-	if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want {
+	data := make([]byte, 8) // 2 * sizeof(uint32).
+	alignedIndex := uintptr(0)
+	if offset := uintptr(unsafe.Pointer(&data[0])) % 4; offset != 0 {
+		alignedIndex = 4 - offset
+	}
+	ptr := unsafe.Pointer(&data[alignedIndex+1])
+	want := AlignmentError{Addr: uintptr(ptr), Alignment: 4}
+	if _, err := CompareAndSwapUint32(ptr, 0, 1); err != want {
 		t.Errorf("Unexpected error: got %v, want %v", err, want)
 	}
 }
@@ -252,8 +264,8 @@ func TestCopyInSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := CopyIn(dst, src)
 				if n != bytesBeforeFault {
@@ -276,8 +288,8 @@ func TestCopyInBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := CopyIn(dst, src)
 				if n != bytesBeforeFault {
@@ -300,8 +312,8 @@ func TestCopyOutSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := CopyOut(dst, src)
 				if n != bytesBeforeFault {
@@ -324,8 +336,8 @@ func TestCopyOutBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := CopyOut(dst, src)
 				if n != bytesBeforeFault {
@@ -348,8 +360,8 @@ func TestCopySourceSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -372,8 +384,8 @@ func TestCopySourceBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				src := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				dst := randBuf(pageSize)
 				n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -396,8 +408,8 @@ func TestCopyDestinationSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -420,8 +432,8 @@ func TestCopyDestinationBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				src := randBuf(pageSize)
 				n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize)
 				if n != uintptr(bytesBeforeFault) {
@@ -444,8 +456,8 @@ func TestZeroOutSegvError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
 			withSegvErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				n, err := ZeroOut(dst, pageSize)
 				if n != uintptr(bytesBeforeFault) {
 					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
@@ -467,8 +479,8 @@ func TestZeroOutBusError(t *testing.T) {
 	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
 		t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
 			withBusErrorTestMapping(t, func(mapping []byte) {
-				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
-				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
+				dst := unsafe.Pointer(&mapping[pageSize-bytesBeforeFault])
 				n, err := ZeroOut(dst, pageSize)
 				if n != uintptr(bytesBeforeFault) {
 					t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault)
@@ -488,7 +500,7 @@ func TestSwapUint32SegvError(t *testing.T) {
 	// Test that SwapUint32 returns a SegvError when reaching a page that
 	// signals SIGSEGV.
 	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
 		if want := (SegvError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -500,7 +512,7 @@ func TestSwapUint32BusError(t *testing.T) {
 	// Test that SwapUint32 returns a BusError when reaching a page that
 	// signals SIGBUS.
 	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint32(unsafe.Pointer(secondPage), 1)
 		if want := (BusError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -512,7 +524,7 @@ func TestSwapUint64SegvError(t *testing.T) {
 	// Test that SwapUint64 returns a SegvError when reaching a page that
 	// signals SIGSEGV.
 	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
 		if want := (SegvError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -524,7 +536,7 @@ func TestSwapUint64BusError(t *testing.T) {
 	// Test that SwapUint64 returns a BusError when reaching a page that
 	// signals SIGBUS.
 	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := SwapUint64(unsafe.Pointer(secondPage), 1)
 		if want := (BusError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -536,7 +548,7 @@ func TestCompareAndSwapUint32SegvError(t *testing.T) {
 	// Test that CompareAndSwapUint32 returns a SegvError when reaching a page
 	// that signals SIGSEGV.
 	withSegvErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
 		if want := (SegvError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
@@ -548,7 +560,7 @@ func TestCompareAndSwapUint32BusError(t *testing.T) {
 	// Test that CompareAndSwapUint32 returns a BusError when reaching a page
 	// that signals SIGBUS.
 	withBusErrorTestMapping(t, func(mapping []byte) {
-		secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+		secondPage := uintptr(unsafe.Pointer(&mapping[pageSize]))
 		_, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1)
 		if want := (BusError{secondPage}); err != want {
 			t.Errorf("Unexpected error: got %v, want %v", err, want)
diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go
index eef028e68..41dd567f3 100644
--- a/pkg/safecopy/safecopy_unsafe.go
+++ b/pkg/safecopy/safecopy_unsafe.go
@@ -16,6 +16,7 @@ package safecopy
 
 import (
 	"fmt"
+	"runtime"
 	"syscall"
 	"unsafe"
 )
@@ -35,7 +36,7 @@ const maxRegisterSize = 16
 // successfully copied.
 //
 //go:noescape
-func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+func memcpy(dst, src uintptr, n uintptr) (fault uintptr, sig int32)
 
 // memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
 // signal is received during the write, it returns the address that caused the
@@ -47,7 +48,7 @@ func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32
 // successfully written.
 //
 //go:noescape
-func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+func memclr(ptr uintptr, n uintptr) (fault uintptr, sig int32)
 
 // swapUint32 atomically stores new into *ptr and returns (the previous *ptr
 // value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
@@ -90,29 +91,35 @@ func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
 // CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
 // copied and an error if SIGSEGV or SIGBUS is received while reading from src.
 func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
+	n, err := copyIn(dst, uintptr(src))
+	runtime.KeepAlive(src)
+	return n, err
+}
+
+// copyIn is the underlying definition for CopyIn.
+func copyIn(dst []byte, src uintptr) (int, error) {
 	toCopy := uintptr(len(dst))
 	if len(dst) == 0 {
 		return 0, nil
 	}
 
-	fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy)
+	fault, sig := memcpy(uintptr(unsafe.Pointer(&dst[0])), src, toCopy)
 	if sig == 0 {
 		return len(dst), nil
 	}
 
-	faultN, srcN := uintptr(fault), uintptr(src)
-	if faultN < srcN || faultN >= srcN+toCopy {
-		panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy))
+	if fault < src || fault >= src+toCopy {
+		panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, fault, src, src+toCopy))
 	}
 
 	// memcpy might have ended the copy up to maxRegisterSize bytes before
 	// fault, if an instruction caused a memory access that straddled two
 	// pages, and the second one faulted. Try to copy up to the fault.
 	var done int
-	if faultN-srcN > maxRegisterSize {
-		done = int(faultN - srcN - maxRegisterSize)
+	if fault-src > maxRegisterSize {
+		done = int(fault - src - maxRegisterSize)
 	}
-	n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done)))
+	n, err := copyIn(dst[done:int(fault-src)], src+uintptr(done))
 	done += n
 	if err != nil {
 		return done, err
@@ -124,29 +131,35 @@ func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
 // bytes done and an error if SIGSEGV or SIGBUS is received while writing to
 // dst.
 func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
+	n, err := copyOut(uintptr(dst), src)
+	runtime.KeepAlive(dst)
+	return n, err
+}
+
+// copyOut is the underlying definition for CopyOut.
+func copyOut(dst uintptr, src []byte) (int, error) {
 	toCopy := uintptr(len(src))
 	if toCopy == 0 {
 		return 0, nil
 	}
 
-	fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy)
+	fault, sig := memcpy(dst, uintptr(unsafe.Pointer(&src[0])), toCopy)
 	if sig == 0 {
 		return len(src), nil
 	}
 
-	faultN, dstN := uintptr(fault), uintptr(dst)
-	if faultN < dstN || faultN >= dstN+toCopy {
-		panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy))
+	if fault < dst || fault >= dst+toCopy {
+		panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toCopy))
 	}
 
 	// memcpy might have ended the copy up to maxRegisterSize bytes before
 	// fault, if an instruction caused a memory access that straddled two
 	// pages, and the second one faulted. Try to copy up to the fault.
 	var done int
-	if faultN-dstN > maxRegisterSize {
-		done = int(faultN - dstN - maxRegisterSize)
+	if fault-dst > maxRegisterSize {
+		done = int(fault - dst - maxRegisterSize)
 	}
-	n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)])
+	n, err := copyOut(dst+uintptr(done), src[done:int(fault-dst)])
 	done += n
 	if err != nil {
 		return done, err
@@ -161,6 +174,14 @@ func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
 // Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap,
 // the resulting contents of dst are unspecified.
 func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
+	n, err := copyN(uintptr(dst), uintptr(src), toCopy)
+	runtime.KeepAlive(dst)
+	runtime.KeepAlive(src)
+	return n, err
+}
+
+// copyN is the underlying definition for Copy.
+func copyN(dst, src uintptr, toCopy uintptr) (uintptr, error) {
 	if toCopy == 0 {
 		return 0, nil
 	}
@@ -171,17 +192,16 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
 	}
 
 	// Did the fault occur while reading from src or writing to dst?
-	faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst)
 	faultAfterSrc := ^uintptr(0)
-	if faultN >= srcN {
-		faultAfterSrc = faultN - srcN
+	if fault >= src {
+		faultAfterSrc = fault - src
 	}
 	faultAfterDst := ^uintptr(0)
-	if faultN >= dstN {
-		faultAfterDst = faultN - dstN
+	if fault >= dst {
+		faultAfterDst = fault - dst
 	}
 	if faultAfterSrc >= toCopy && faultAfterDst >= toCopy {
-		panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy))
+		panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, fault, src, src+toCopy, dst, dst+toCopy))
 	}
 	faultedAfter := faultAfterSrc
 	if faultedAfter > faultAfterDst {
@@ -195,7 +215,7 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
 	if faultedAfter > maxRegisterSize {
 		done = faultedAfter - maxRegisterSize
 	}
-	n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done)
+	n, err := copyN(dst+done, src+done, faultedAfter-done)
 	done += n
 	if err != nil {
 		return done, err
@@ -206,6 +226,13 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
 // ZeroOut writes toZero zero bytes to dst. It returns the number of bytes
 // written and an error if SIGSEGV or SIGBUS is received while writing to dst.
 func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
+	n, err := zeroOut(uintptr(dst), toZero)
+	runtime.KeepAlive(dst)
+	return n, err
+}
+
+// zeroOut is the underlying definition for ZeroOut.
+func zeroOut(dst uintptr, toZero uintptr) (uintptr, error) {
 	if toZero == 0 {
 		return 0, nil
 	}
@@ -215,19 +242,18 @@ func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
 		return toZero, nil
 	}
 
-	faultN, dstN := uintptr(fault), uintptr(dst)
-	if faultN < dstN || faultN >= dstN+toZero {
-		panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero))
+	if fault < dst || fault >= dst+toZero {
+		panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toZero))
 	}
 
 	// memclr might have ended the write up to maxRegisterSize bytes before
 	// fault, if an instruction caused a memory access that straddled two
 	// pages, and the second one faulted. Try to write up to the fault.
 	var done uintptr
-	if faultN-dstN > maxRegisterSize {
-		done = faultN - dstN - maxRegisterSize
+	if fault-dst > maxRegisterSize {
+		done = fault - dst - maxRegisterSize
 	}
-	n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done)
+	n, err := zeroOut(dst+done, fault-dst-done)
 	done += n
 	if err != nil {
 		return done, err
@@ -243,7 +269,7 @@ func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) {
 		return 0, AlignmentError{addr, 4}
 	}
 	old, sig := swapUint32(ptr, new)
-	return old, errorFromFaultSignal(ptr, sig)
+	return old, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
 // SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns
@@ -254,7 +280,7 @@ func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) {
 		return 0, AlignmentError{addr, 8}
 	}
 	old, sig := swapUint64(ptr, new)
-	return old, errorFromFaultSignal(ptr, sig)
+	return old, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
 // CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32,
@@ -265,7 +291,7 @@ func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
 		return 0, AlignmentError{addr, 4}
 	}
 	prev, sig := compareAndSwapUint32(ptr, old, new)
-	return prev, errorFromFaultSignal(ptr, sig)
+	return prev, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
 // LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
@@ -277,17 +303,17 @@ func LoadUint32(ptr unsafe.Pointer) (uint32, error) {
 		return 0, AlignmentError{addr, 4}
 	}
 	val, sig := loadUint32(ptr)
-	return val, errorFromFaultSignal(ptr, sig)
+	return val, errorFromFaultSignal(uintptr(ptr), sig)
 }
 
-func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error {
+func errorFromFaultSignal(addr uintptr, sig int32) error {
 	switch sig {
 	case 0:
 		return nil
 	case int32(syscall.SIGSEGV):
-		return SegvError{uintptr(addr)}
+		return SegvError{addr}
 	case int32(syscall.SIGBUS):
-		return BusError{uintptr(addr)}
+		return BusError{addr}
 	default:
 		panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr))
 	}
-- 
cgit v1.2.3


From 6823b5e244a5748032130574ae3a25a0a36bbbf5 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 4 Feb 2020 13:05:30 -0800
Subject: timer_create(2) should return 0 on success

The timer ID is copied out to the argument.

Fixes #1738

PiperOrigin-RevId: 293210801
---
 pkg/sentry/syscalls/linux/sys_timer.go |  2 +-
 test/syscalls/linux/timers.cc          | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index 432351917..a4c400f87 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -146,7 +146,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		return 0, nil, err
 	}
 
-	return uintptr(id), nil, nil
+	return 0, nil, nil
 }
 
 // TimerSettime implements linux syscall timer_settime(2).
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 3db18d7ac..2f92c27da 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -297,9 +297,13 @@ class IntervalTimer {
 PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
                                         const struct sigevent& sev) {
   int timerid;
-  if (syscall(SYS_timer_create, clockid, &sev, &timerid) < 0) {
+  int ret = syscall(SYS_timer_create, clockid, &sev, &timerid);
+  if (ret < 0) {
     return PosixError(errno, "timer_create");
   }
+  if (ret > 0) {
+    return PosixError(EINVAL, "timer_create should never return positive");
+  }
   MaybeSave();
   return IntervalTimer(timerid);
 }
@@ -317,6 +321,18 @@ TEST(IntervalTimerTest, IsInitiallyStopped) {
   EXPECT_EQ(0, its.it_value.tv_nsec);
 }
 
+// Kernel can create multiple timers without issue.
+//
+// Regression test for gvisor.dev/issue/1738.
+TEST(IntervalTimerTest, MultipleTimers) {
+  struct sigevent sev = {};
+  sev.sigev_notify = SIGEV_NONE;
+  const auto timer1 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+  const auto timer2 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+}
+
 TEST(IntervalTimerTest, SingleShotSilent) {
   struct sigevent sev = {};
   sev.sigev_notify = SIGEV_NONE;
-- 
cgit v1.2.3


From a26a954946ad2e7910d3ad7578960a93b73a1f9b Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 4 Feb 2020 15:20:30 -0800
Subject: Add socket connection stress test.

Tests 65k connection attempts on common types of sockets to check for port
leaks.

Also fixes a bug where dual-stack sockets wouldn't properly re-queue
segments received while closing.

PiperOrigin-RevId: 293241166
---
 pkg/tcpip/transport/tcp/connect.go           |   4 ++
 test/syscalls/BUILD                          |   9 +++
 test/syscalls/linux/BUILD                    |  17 +++++
 test/syscalls/linux/ip_socket_test_util.cc   |  27 +++++++
 test/syscalls/linux/ip_socket_test_util.h    |  15 ++++
 test/syscalls/linux/socket_generic_stress.cc |  83 ++++++++++++++++++++++
 test/syscalls/linux/socket_test_util.cc      | 101 ++++++++++++++++++++++++---
 test/syscalls/linux/socket_test_util.h       |   6 ++
 8 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 test/syscalls/linux/socket_generic_stress.cc

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 9ff7ac261..5c5397823 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -989,6 +989,10 @@ func (e *endpoint) transitionToStateCloseLocked() {
 // to any other listening endpoint. We reply with RST if we cannot find one.
 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
+	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
+		// Dual-stack socket, try IPv4.
+		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
+	}
 	if ep == nil {
 		replyWithReset(s)
 		s.decRef()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 8f2b75a1c..31d239c0e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -45,6 +45,15 @@ syscall_test(test = "//test/syscalls/linux:brk_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_test")
 
+syscall_test(
+    size = "large",
+    shard_count = 50,
+    # Takes too long for TSAN. Since this is kind of a stress test that doesn't
+    # involve much concurrency, TSAN's usefulness here is limited anyway.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:socket_stress_test",
+)
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chdir_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 737e2329f..273b014d6 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -136,6 +136,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
+        "@com_google_absl//absl/types:optional",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
         "//test/util:temp_path",
@@ -2151,6 +2152,22 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_binary(
+    name = "socket_stress_test",
+    testonly = 1,
+    srcs = [
+        "socket_generic_stress.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_test_util",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "socket_unix_dgram_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 6b472eb2f..bba022a41 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -79,6 +79,33 @@ SocketPairKind DualStackTCPAcceptBindSocketPair(int type) {
                                      /* dual_stack = */ true)};
 }
 
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket");
+  return SocketPairKind{description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ true)};
+}
+
 SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) {
   std::string description =
       absl::StrCat(DescribeSocketType(type), "connected IPv6 UDP socket");
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 0f58e0f77..083ebbcf0 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -50,6 +50,21 @@ SocketPairKind IPv4TCPAcceptBindSocketPair(int type);
 // given type bound to the IPv4 loopback.
 SocketPairKind DualStackTCPAcceptBindSocketPair(int type);
 
+// IPv6TCPAcceptBindPersistentListenerSocketPair is like
+// IPv6TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// IPv4TCPAcceptBindPersistentListenerSocketPair is like
+// IPv4TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// DualStackTCPAcceptBindPersistentListenerSocketPair is like
+// DualStackTCPAcceptBindSocketPair except it uses a persistent listening socket
+// to create all socket pairs.
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type);
+
 // IPv6UDPBidirectionalBindSocketPair returns a SocketPairKind that represents
 // SocketPairs created with bind() and connect() syscalls with AF_INET6 and the
 // given type bound to the IPv6 loopback.
diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc
new file mode 100644
index 000000000..6a232238d
--- /dev/null
+++ b/test/syscalls/linux/socket_generic_stress.cc
@@ -0,0 +1,83 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected sockets.
+using ConnectStressTest = SocketPairTest;
+
+TEST_P(ConnectStressTest, Reset65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+    // Send some data to ensure that the connection gets reset and the port gets
+    // released immediately. This avoids either end entering TIME-WAIT.
+    char sent_data[100] = {};
+    ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+                SyscallSucceedsWithValue(sizeof(sent_data)));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, ConnectStressTest,
+    ::testing::Values(IPv6UDPBidirectionalBindSocketPair(0),
+                      IPv4UDPBidirectionalBindSocketPair(0),
+                      DualStackUDPBidirectionalBindSocketPair(0),
+
+                      // Without REUSEADDR, we get port exhaustion on Linux.
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv6TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv4TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+                          DualStackTCPAcceptBindSocketPair(0))));
+
+// Test fixture for tests that apply to pairs of connected sockets created with
+// a persistent listener (if applicable).
+using PersistentListenerConnectStressTest = SocketPairTest;
+
+TEST_P(PersistentListenerConnectStressTest, 65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, PersistentListenerConnectStressTest,
+    ::testing::Values(
+        IPv6UDPBidirectionalBindSocketPair(0),
+        IPv4UDPBidirectionalBindSocketPair(0),
+        DualStackUDPBidirectionalBindSocketPair(0),
+
+        // Without REUSEADDR, we get port exhaustion on Linux.
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv6TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv4TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            DualStackTCPAcceptBindPersistentListenerSocketPair(0))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index eff7d577e..c0c5ab3fe 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -18,10 +18,13 @@
 #include <poll.h>
 #include <sys/socket.h>
 
+#include <memory>
+
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/time/clock.h"
+#include "absl/types/optional.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
@@ -109,7 +112,10 @@ Creator<SocketPair> AcceptBindSocketPairCreator(bool abstract, int domain,
       MaybeSave();  // Unlinked path.
     }
 
-    return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
+    // accepted is before connected to destruct connected before accepted.
+    // Destructors for nonstatic member objects are called in the reverse order
+    // in which they appear in the class declaration.
+    return absl::make_unique<AddrFDSocketPair>(accepted, connected, bind_addr,
                                                extra_addr);
   };
 }
@@ -311,11 +317,16 @@ PosixErrorOr<T> BindIP(int fd, bool dual_stack) {
 }
 
 template <typename T>
-PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
-    int bound, int connected, int type, bool dual_stack) {
-  ASSIGN_OR_RETURN_ERRNO(T bind_addr, BindIP<T>(bound, dual_stack));
-  RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5));
+PosixErrorOr<T> TCPBindAndListen(int fd, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T addr, BindIP<T>(fd, dual_stack));
+  RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd, /* backlog = */ 5));
+  return addr;
+}
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>>
+CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
+                                 bool dual_stack, T bind_addr) {
   int connect_result = 0;
   RETURN_ERROR_IF_SYSCALL_FAIL(
       (connect_result = RetryEINTR(connect)(
@@ -358,16 +369,27 @@ PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
     absl::SleepFor(absl::Seconds(1));
   }
 
-  // Cleanup no longer needed resources.
-  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
-  MaybeSave();  // Successful close.
-
   T extra_addr = {};
   LocalhostAddr(&extra_addr, dual_stack);
   return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
                                              extra_addr);
 }
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
+    int bound, int connected, int type, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T bind_addr, TCPBindAndListen<T>(bound, dual_stack));
+
+  auto result = CreateTCPConnectAcceptSocketPair(bound, connected, type,
+                                                 dual_stack, bind_addr);
+
+  // Cleanup no longer needed resources.
+  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
+  MaybeSave();  // Successful close.
+
+  return result;
+}
+
 Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack) {
@@ -389,6 +411,63 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
   };
 }
 
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack) {
+  // These are lazily initialized below, on the first call to the returned
+  // lambda. These values are private to each returned lambda, but shared across
+  // invocations of a specific lambda.
+  //
+  // The sharing allows pairs created with the same parameters to share a
+  // listener. This prevents future connects from failing if the connecting
+  // socket selects a port which had previously been used by a listening socket
+  // that still has some connections in TIME-WAIT.
+  //
+  // The lazy initialization is to avoid creating sockets during parameter
+  // enumeration. This is important because parameters are enumerated during the
+  // build process where networking may not be available.
+  auto listener = std::make_shared<absl::optional<int>>(absl::optional<int>());
+  auto addr4 = std::make_shared<absl::optional<sockaddr_in>>(
+      absl::optional<sockaddr_in>());
+  auto addr6 = std::make_shared<absl::optional<sockaddr_in6>>(
+      absl::optional<sockaddr_in6>());
+
+  return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+    int connected;
+    RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol));
+    MaybeSave();  // Successful socket creation.
+
+    // Share the listener across invocations.
+    if (!listener->has_value()) {
+      int fd = socket(domain, type, protocol);
+      if (fd < 0) {
+        return PosixError(errno, absl::StrCat("socket(", domain, ", ", type,
+                                              ", ", protocol, ")"));
+      }
+      listener->emplace(fd);
+      MaybeSave();  // Successful socket creation.
+    }
+
+    // Bind the listener once, but create a new connect/accept pair each
+    // time.
+    if (domain == AF_INET) {
+      if (!addr4->has_value()) {
+        addr4->emplace(
+            TCPBindAndListen<sockaddr_in>(listener->value(), dual_stack)
+                .ValueOrDie());
+      }
+      return CreateTCPConnectAcceptSocketPair(listener->value(), connected,
+                                              type, dual_stack, addr4->value());
+    }
+    if (!addr6->has_value()) {
+      addr6->emplace(
+          TCPBindAndListen<sockaddr_in6>(listener->value(), dual_stack)
+              .ValueOrDie());
+    }
+    return CreateTCPConnectAcceptSocketPair(listener->value(), connected, type,
+                                            dual_stack, addr6->value());
+  };
+}
+
 template <typename T>
 PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateUDPBoundSocketPair(
     int sock1, int sock2, int type, bool dual_stack) {
@@ -518,8 +597,8 @@ size_t CalculateUnixSockAddrLen(const char* sun_path) {
   if (sun_path[0] == 0) {
     return sizeof(sockaddr_un);
   }
-  // Filesystem addresses use the address length plus the 2 byte sun_family and
-  // null terminator.
+  // Filesystem addresses use the address length plus the 2 byte sun_family
+  // and null terminator.
   return strlen(sun_path) + 3;
 }
 
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index 2dbb8bed3..bfaa6e397 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -273,6 +273,12 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack);
 
+// TCPAcceptBindPersistentListenerSocketPairCreator is like
+// TCPAcceptBindSocketPairCreator, except it uses the same listening socket to
+// create all SocketPairs.
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack);
+
 // UDPBidirectionalBindSocketPairCreator returns a Creator<SocketPair> that
 // obtains file descriptors by invoking the bind() and connect() syscalls on UDP
 // sockets.
-- 
cgit v1.2.3


From 665b614e4a6e715bac25bea15c5c29184016e549 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 4 Feb 2020 18:04:26 -0800
Subject: Support RTM_NEWADDR and RTM_GETLINK in (rt)netlink.

PiperOrigin-RevId: 293271055
---
 pkg/sentry/inet/inet.go                      |   4 +
 pkg/sentry/inet/test_stack.go                |   6 +
 pkg/sentry/socket/hostinet/stack.go          |   5 +
 pkg/sentry/socket/netlink/BUILD              |  14 +-
 pkg/sentry/socket/netlink/message.go         | 129 +++++++++++
 pkg/sentry/socket/netlink/message_test.go    | 312 +++++++++++++++++++++++++++
 pkg/sentry/socket/netlink/provider.go        |   2 +-
 pkg/sentry/socket/netlink/route/BUILD        |   2 -
 pkg/sentry/socket/netlink/route/protocol.go  | 238 ++++++++++++++------
 pkg/sentry/socket/netlink/socket.go          |  54 ++---
 pkg/sentry/socket/netlink/uevent/protocol.go |   2 +-
 pkg/sentry/socket/netstack/stack.go          |  55 +++++
 pkg/tcpip/stack/stack.go                     |   9 +
 test/syscalls/linux/BUILD                    |   2 +
 test/syscalls/linux/socket_netlink_route.cc  | 296 ++++++++++++++++++++-----
 test/syscalls/linux/socket_netlink_util.cc   |  45 +++-
 test/syscalls/linux/socket_netlink_util.h    |   9 +
 17 files changed, 1022 insertions(+), 162 deletions(-)
 create mode 100644 pkg/sentry/socket/netlink/message_test.go

(limited to 'pkg')

diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index a7dfb78a7..2916a0644 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -28,6 +28,10 @@ type Stack interface {
 	// interface indexes to a slice of associated interface address properties.
 	InterfaceAddrs() map[int32][]InterfaceAddr
 
+	// AddInterfaceAddr adds an address to the network interface identified by
+	// index.
+	AddInterfaceAddr(idx int32, addr InterfaceAddr) error
+
 	// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
 	SupportsIPv6() bool
 
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index dcfcbd97e..d8961fc94 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -47,6 +47,12 @@ func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr {
 	return s.InterfaceAddrsMap
 }
 
+// AddInterfaceAddr implements Stack.AddInterfaceAddr.
+func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error {
+	s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr)
+	return nil
+}
+
 // SupportsIPv6 implements Stack.SupportsIPv6.
 func (s *TestStack) SupportsIPv6() bool {
 	return s.SupportsIPv6Flag
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index 034eca676..a48082631 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -310,6 +310,11 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return addrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	return syserror.EACCES
+}
+
 // SupportsIPv6 implements inet.Stack.SupportsIPv6.
 func (s *Stack) SupportsIPv6() bool {
 	return s.supportsIPv6
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f8b8e467d..1911cd9b8 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -33,3 +33,15 @@ go_library(
         "//pkg/waiter",
     ],
 )
+
+go_test(
+    name = "netlink_test",
+    size = "small",
+    srcs = [
+        "message_test.go",
+    ],
+    deps = [
+        ":netlink",
+        "//pkg/abi/linux",
+    ],
+)
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index b21e0ca4b..4ea252ccb 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -30,8 +30,16 @@ func alignUp(length int, align uint) int {
 	return (length + int(align) - 1) &^ (int(align) - 1)
 }
 
+// alignPad returns the length of padding required for alignment.
+//
+// Preconditions: align is a power of two.
+func alignPad(length int, align uint) int {
+	return alignUp(length, align) - length
+}
+
 // Message contains a complete serialized netlink message.
 type Message struct {
+	hdr linux.NetlinkMessageHeader
 	buf []byte
 }
 
@@ -40,10 +48,86 @@ type Message struct {
 // The header length will be updated by Finalize.
 func NewMessage(hdr linux.NetlinkMessageHeader) *Message {
 	return &Message{
+		hdr: hdr,
 		buf: binary.Marshal(nil, usermem.ByteOrder, hdr),
 	}
 }
 
+// ParseMessage parses the first message seen at buf, returning the rest of the
+// buffer. If message is malformed, ok of false is returned. For last message,
+// padding check is loose, if there isn't enought padding, whole buf is consumed
+// and ok is set to true.
+func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) {
+	b := BytesView(buf)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+	var hdr linux.NetlinkMessageHeader
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	// Msg portion.
+	totalMsgLen := int(hdr.Length)
+	_, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+
+	// Padding.
+	numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return
+	}
+
+	return &Message{
+		hdr: hdr,
+		buf: buf[:totalMsgLen],
+	}, []byte(b), true
+}
+
+// Header returns the header of this message.
+func (m *Message) Header() linux.NetlinkMessageHeader {
+	return m.hdr
+}
+
+// GetData unmarshals the payload message header from this netlink message, and
+// returns the attributes portion.
+func (m *Message) GetData(msg interface{}) (AttrsView, bool) {
+	b := BytesView(m.buf)
+
+	_, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return nil, false
+	}
+
+	size := int(binary.Size(msg))
+	msgBytes, ok := b.Extract(size)
+	if !ok {
+		return nil, false
+	}
+	binary.Unmarshal(msgBytes, usermem.ByteOrder, msg)
+
+	numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return nil, false
+	}
+
+	return AttrsView(b), true
+}
+
 // Finalize returns the []byte containing the entire message, with the total
 // length set in the message header. The Message must not be modified after
 // calling Finalize.
@@ -157,3 +241,48 @@ func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message {
 	ms.Messages = append(ms.Messages, m)
 	return m
 }
+
+// AttrsView is a view into the attributes portion of a netlink message.
+type AttrsView []byte
+
+// Empty returns whether there is no attribute left in v.
+func (v AttrsView) Empty() bool {
+	return len(v) == 0
+}
+
+// ParseFirst parses first netlink attribute at the beginning of v.
+func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) {
+	b := BytesView(v)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+
+	_, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO))
+	if !ok {
+		return
+	}
+
+	return hdr, value, AttrsView(b), ok
+}
+
+// BytesView supports extracting data from a byte slice with bounds checking.
+type BytesView []byte
+
+// Extract removes the first n bytes from v and returns it. If n is out of
+// bounds, it returns false.
+func (v *BytesView) Extract(n int) ([]byte, bool) {
+	if n < 0 || n > len(*v) {
+		return nil, false
+	}
+	extracted := (*v)[:n]
+	*v = (*v)[n:]
+	return extracted, true
+}
diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go
new file mode 100644
index 000000000..ef13d9386
--- /dev/null
+++ b/pkg/sentry/socket/netlink/message_test.go
@@ -0,0 +1,312 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package message_test
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+)
+
+type dummyNetlinkMsg struct {
+	Foo uint16
+}
+
+func TestParseMessage(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		header  linux.NetlinkMessageHeader
+		dataMsg *dummyNetlinkMsg
+		restLen int
+		ok      bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid with next message",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+				0xFF, // Next message (rest)
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 1,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message without padding",
+			input: []byte{
+				0x12, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 18,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message not to be aligned",
+			input: []byte{
+				0x13, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+				0x00, // Excessive 1 byte permitted at end
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 19,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "header.Length too short",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header.Length too long",
+			input: []byte{
+				0xFF, 0xFF, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header incomplete",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+			},
+			ok: false,
+		},
+		{
+			desc:  "empty message",
+			input: []byte{},
+			ok:    false,
+		},
+	}
+	for _, test := range tests {
+		msg, rest, ok := netlink.ParseMessage(test.input)
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+			continue
+		}
+		if !test.ok {
+			continue
+		}
+		if !reflect.DeepEqual(msg.Header(), test.header) {
+			t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header)
+		}
+
+		dataMsg := &dummyNetlinkMsg{}
+		_, dataOk := msg.GetData(dataMsg)
+		if !dataOk {
+			t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk)
+		} else if !reflect.DeepEqual(dataMsg, test.dataMsg) {
+			t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg)
+		}
+
+		if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) {
+			t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
+
+func TestAttrView(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		// Outputs for ParseFirst.
+		hdr     linux.NetlinkAttrHeader
+		value   []byte
+		restLen int
+		ok      bool
+
+		// Outputs for Empty.
+		isEmpty bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x06, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 6,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment with rest data",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+				0xFF, 0xFE, // Rest data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 2,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too long",
+			input: []byte{
+				0xFF, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too short",
+			input: []byte{
+				0x01, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc:    "empty",
+			input:   []byte{},
+			ok:      false,
+			isEmpty: true,
+		},
+	}
+	for _, test := range tests {
+		attrs := netlink.AttrsView(test.input)
+
+		// Test ParseFirst().
+		hdr, value, rest, ok := attrs.ParseFirst()
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+		} else if test.ok {
+			if !reflect.DeepEqual(hdr, test.hdr) {
+				t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr)
+			}
+			if !bytes.Equal(value, test.value) {
+				t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value)
+			}
+			if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) {
+				t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest)
+			}
+		}
+
+		// Test Empty().
+		if got, want := attrs.Empty(), test.isEmpty; got != want {
+			t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 07f860a49..b0dc70e5c 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -42,7 +42,7 @@ type Protocol interface {
 	// If err == nil, any messages added to ms will be sent back to the
 	// other end of the socket. Setting ms.Multi will cause an NLMSG_DONE
 	// message to be sent even if ms contains no messages.
-	ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error
+	ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error
 }
 
 // Provider is a function that creates a new Protocol for a specific netlink
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 622a1eafc..93127398d 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -10,13 +10,11 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
-        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 2b3c7f5b3..c84d8bd7c 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -17,16 +17,15 @@ package route
 
 import (
 	"bytes"
+	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // commandKind describes the operational class of a message type.
@@ -69,13 +68,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // dumpLinks handles RTM_GETLINK dump requests.
-func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
-	}
-
+func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
 	// ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some
 	// userspace applications (including glibc) still include rtgenmsg.
@@ -99,44 +92,105 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 		return nil
 	}
 
-	for id, i := range stack.Interfaces() {
-		m := ms.AddMessage(linux.NetlinkMessageHeader{
-			Type: linux.RTM_NEWLINK,
-		})
+	for idx, i := range stack.Interfaces() {
+		addNewLinkMessage(ms, idx, i)
+	}
 
-		m.Put(linux.InterfaceInfoMessage{
-			Family: linux.AF_UNSPEC,
-			Type:   i.DeviceType,
-			Index:  id,
-			Flags:  i.Flags,
-		})
+	return nil
+}
 
-		m.PutAttrString(linux.IFLA_IFNAME, i.Name)
-		m.PutAttr(linux.IFLA_MTU, i.MTU)
+// getLinks handles RTM_GETLINK requests.
+func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network devices.
+		return nil
+	}
 
-		mac := make([]byte, 6)
-		brd := mac
-		if len(i.Addr) > 0 {
-			mac = i.Addr
-			brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
+	// Parse message.
+	var ifi linux.InterfaceInfoMessage
+	attrs, ok := msg.GetData(&ifi)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	// Parse attributes.
+	var byName []byte
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
 		}
-		m.PutAttr(linux.IFLA_ADDRESS, mac)
-		m.PutAttr(linux.IFLA_BROADCAST, brd)
+		attrs = rest
 
-		// TODO(gvisor.dev/issue/578): There are many more attributes.
+		switch ahdr.Type {
+		case linux.IFLA_IFNAME:
+			if len(value) < 1 {
+				return syserr.ErrInvalidArgument
+			}
+			byName = value[:len(value)-1]
+
+			// TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK.
+		}
 	}
 
+	found := false
+	for idx, i := range stack.Interfaces() {
+		switch {
+		case ifi.Index > 0:
+			if idx != ifi.Index {
+				continue
+			}
+		case byName != nil:
+			if string(byName) != i.Name {
+				continue
+			}
+		default:
+			// Criteria not specified.
+			return syserr.ErrInvalidArgument
+		}
+
+		addNewLinkMessage(ms, idx, i)
+		found = true
+		break
+	}
+	if !found {
+		return syserr.ErrNoDevice
+	}
 	return nil
 }
 
-// dumpAddrs handles RTM_GETADDR dump requests.
-func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
+// addNewLinkMessage appends RTM_NEWLINK message for the given interface into
+// the message set.
+func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.RTM_NEWLINK,
+	})
+
+	m.Put(linux.InterfaceInfoMessage{
+		Family: linux.AF_UNSPEC,
+		Type:   i.DeviceType,
+		Index:  idx,
+		Flags:  i.Flags,
+	})
+
+	m.PutAttrString(linux.IFLA_IFNAME, i.Name)
+	m.PutAttr(linux.IFLA_MTU, i.MTU)
+
+	mac := make([]byte, 6)
+	brd := mac
+	if len(i.Addr) > 0 {
+		mac = i.Addr
+		brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
 	}
+	m.PutAttr(linux.IFLA_ADDRESS, mac)
+	m.PutAttr(linux.IFLA_BROADCAST, brd)
+
+	// TODO(gvisor.dev/issue/578): There are many more attributes.
+}
 
+// dumpAddrs handles RTM_GETADDR dump requests.
+func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETADDR dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -168,6 +222,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 				Index:     uint32(id),
 			})
 
+			m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr))
 			m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
 
 			// TODO(gvisor.dev/issue/578): There are many more attributes.
@@ -252,12 +307,12 @@ func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) {
 }
 
 // parseForDestination parses a message as format of RouteMessage-RtAttr-dst.
-func parseForDestination(data []byte) ([]byte, *syserr.Error) {
+func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) {
 	var rtMsg linux.RouteMessage
-	if len(data) < linux.SizeOfRouteMessage {
+	attrs, ok := msg.GetData(&rtMsg)
+	if !ok {
 		return nil, syserr.ErrInvalidArgument
 	}
-	binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg)
 	// iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See
 	// commit bc234301af12. Note we don't check this flag for backward
 	// compatibility.
@@ -265,26 +320,15 @@ func parseForDestination(data []byte) ([]byte, *syserr.Error) {
 		return nil, syserr.ErrNotSupported
 	}
 
-	data = data[linux.SizeOfRouteMessage:]
-
-	// TODO(gvisor.dev/issue/1611): Add generic attribute parsing.
-	var rtAttr linux.RtAttr
-	if len(data) < linux.SizeOfRtAttr {
-		return nil, syserr.ErrInvalidArgument
+	// Expect first attribute is RTA_DST.
+	if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST {
+		return value, nil
 	}
-	binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr)
-	if rtAttr.Type != linux.RTA_DST {
-		return nil, syserr.ErrInvalidArgument
-	}
-
-	if len(data) < int(rtAttr.Len) {
-		return nil, syserr.ErrInvalidArgument
-	}
-	return data[linux.SizeOfRtAttr:rtAttr.Len], nil
+	return nil, syserr.ErrInvalidArgument
 }
 
 // dumpRoutes handles RTM_GETROUTE requests.
-func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETROUTE dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -295,10 +339,11 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 		return nil
 	}
 
+	hdr := msg.Header()
 	routeTables := stack.RouteTable()
 
 	if hdr.Flags == linux.NLM_F_REQUEST {
-		dst, err := parseForDestination(data)
+		dst, err := parseForDestination(msg)
 		if err != nil {
 			return err
 		}
@@ -357,10 +402,55 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 	return nil
 }
 
+// newAddr handles RTM_NEWADDR requests.
+func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network stack.
+		return syserr.ErrProtocolNotSupported
+	}
+
+	var ifa linux.InterfaceAddrMessage
+	attrs, ok := msg.GetData(&ifa)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
+		}
+		attrs = rest
+
+		switch ahdr.Type {
+		case linux.IFA_LOCAL:
+			err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
+				Family:    ifa.Family,
+				PrefixLen: ifa.PrefixLen,
+				Flags:     ifa.Flags,
+				Addr:      value,
+			})
+			if err == syscall.EEXIST {
+				flags := msg.Header().Flags
+				if flags&linux.NLM_F_EXCL != 0 {
+					return syserr.ErrExists
+				}
+			} else if err != nil {
+				return syserr.ErrInvalidArgument
+			}
+		}
+	}
+	return nil
+}
+
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	hdr := msg.Header()
+
 	// All messages start with a 1 byte protocol family.
-	if len(data) < 1 {
+	var family uint8
+	if _, ok := msg.GetData(&family); !ok {
 		// Linux ignores messages missing the protocol family. See
 		// net/core/rtnetlink.c:rtnetlink_rcv_msg.
 		return nil
@@ -374,16 +464,32 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		}
 	}
 
-	switch hdr.Type {
-	case linux.RTM_GETLINK:
-		return p.dumpLinks(ctx, hdr, data, ms)
-	case linux.RTM_GETADDR:
-		return p.dumpAddrs(ctx, hdr, data, ms)
-	case linux.RTM_GETROUTE:
-		return p.dumpRoutes(ctx, hdr, data, ms)
-	default:
-		return syserr.ErrNotSupported
+	if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP {
+		// TODO(b/68878065): Only the dump variant of the types below are
+		// supported.
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.dumpLinks(ctx, msg, ms)
+		case linux.RTM_GETADDR:
+			return p.dumpAddrs(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
+	} else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST {
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.getLink(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		case linux.RTM_NEWADDR:
+			return p.newAddr(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
 	}
+	return syserr.ErrNotSupported
 }
 
 // init registers the NETLINK_ROUTE provider.
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index c4b95debb..2ca02567d 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -644,47 +644,38 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 	return nil
 }
 
-func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error {
+func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) {
 	m := ms.AddMessage(linux.NetlinkMessageHeader{
 		Type: linux.NLMSG_ERROR,
 	})
-
 	m.Put(linux.NetlinkErrorMessage{
 		Error:  int32(-err.ToLinux().Number()),
 		Header: hdr,
 	})
-	return nil
+}
 
+func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.NLMSG_ERROR,
+	})
+	m.Put(linux.NetlinkErrorMessage{
+		Error:  0,
+		Header: hdr,
+	})
 }
 
 // processMessages handles each message in buf, passing it to the protocol
 // handler for final handling.
 func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
 	for len(buf) > 0 {
-		if len(buf) < linux.NetlinkMessageHeaderSize {
+		msg, rest, ok := ParseMessage(buf)
+		if !ok {
 			// Linux ignores messages that are too short. See
 			// net/netlink/af_netlink.c:netlink_rcv_skb.
 			break
 		}
-
-		var hdr linux.NetlinkMessageHeader
-		binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr)
-
-		if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) {
-			// Linux ignores malformed messages. See
-			// net/netlink/af_netlink.c:netlink_rcv_skb.
-			break
-		}
-
-		// Data from this message.
-		data := buf[linux.NetlinkMessageHeaderSize:hdr.Length]
-
-		// Advance to the next message.
-		next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO)
-		if next >= len(buf)-1 {
-			next = len(buf) - 1
-		}
-		buf = buf[next:]
+		buf = rest
+		hdr := msg.Header()
 
 		// Ignore control messages.
 		if hdr.Type < linux.NLMSG_MIN_TYPE {
@@ -692,19 +683,10 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
 		}
 
 		ms := NewMessageSet(s.portID, hdr.Seq)
-		var err *syserr.Error
-		// TODO(b/68877377): ACKs not supported yet.
-		if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
-			err = syserr.ErrNotSupported
-		} else {
-
-			err = s.protocol.ProcessMessage(ctx, hdr, data, ms)
-		}
-		if err != nil {
-			ms = NewMessageSet(s.portID, hdr.Seq)
-			if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil {
-				return err
-			}
+		if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil {
+			dumpErrorMesage(hdr, ms, err)
+		} else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
+			dumpAckMesage(hdr, ms)
 		}
 
 		if err := s.sendResponse(ctx, ms); err != nil {
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
index 1ee4296bc..029ba21b5 100644
--- a/pkg/sentry/socket/netlink/uevent/protocol.go
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -49,7 +49,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// Silently ignore all messages.
 	return nil
 }
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 31ea66eca..0692482e9 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -20,6 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -88,6 +90,59 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return nicAddrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	var (
+		protocol tcpip.NetworkProtocolNumber
+		address  tcpip.Address
+	)
+	switch addr.Family {
+	case linux.AF_INET:
+		if len(addr.Addr) < header.IPv4AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv4AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv4.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv4AddressSize])
+
+	case linux.AF_INET6:
+		if len(addr.Addr) < header.IPv6AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv6AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv6.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv6AddressSize])
+
+	default:
+		return syserror.ENOTSUP
+	}
+
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol: protocol,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   address,
+			PrefixLen: int(addr.PrefixLen),
+		},
+	}
+
+	// Attach address to interface.
+	if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+		return syserr.TranslateNetstackError(err).ToError()
+	}
+
+	// Add route for local network.
+	s.Stack.AddRoute(tcpip.Route{
+		Destination: protocolAddress.AddressWithPrefix.Subnet(),
+		Gateway:     "", // No gateway for local network.
+		NIC:         tcpip.NICID(idx),
+	})
+	return nil
+}
+
 // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
 func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
 	var rs tcp.ReceiveBufferSizeOption
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7057b110e..b793f1d74 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -795,6 +795,8 @@ func (s *Stack) Forwarding() bool {
 
 // SetRouteTable assigns the route table to be used by this stack. It
 // specifies which NIC to use for given destination address ranges.
+//
+// This method takes ownership of the table.
 func (s *Stack) SetRouteTable(table []tcpip.Route) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -809,6 +811,13 @@ func (s *Stack) GetRouteTable() []tcpip.Route {
 	return append([]tcpip.Route(nil), s.routeTable...)
 }
 
+// AddRoute appends a route to the route table.
+func (s *Stack) AddRoute(route tcpip.Route) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.routeTable = append(s.routeTable, route)
+}
+
 // NewEndpoint creates a new transport layer endpoint of the given protocol.
 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	t, ok := s.transportProtocols[transport]
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 273b014d6..f2e3c7072 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2769,9 +2769,11 @@ cc_binary(
     deps = [
         ":socket_netlink_util",
         ":socket_test_util",
+        "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 1e28e658d..e5aed1eec 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -14,6 +14,7 @@
 
 #include <arpa/inet.h>
 #include <ifaddrs.h>
+#include <linux/if.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
@@ -25,8 +26,10 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
@@ -38,6 +41,8 @@ namespace testing {
 
 namespace {
 
+constexpr uint32_t kSeq = 12345;
+
 using ::testing::AnyOf;
 using ::testing::Eq;
 
@@ -113,58 +118,224 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
+  // Loopback is common among all tests, check that it's found.
+  bool loopbackFound = false;
+  ASSERT_NO_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    CheckGetLinkResponse(hdr, kSeq, port);
+    if (hdr->nlmsg_type != RTM_NEWLINK) {
+      return;
+    }
+    ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    std::cout << "Found interface idx=" << msg->ifi_index
+              << ", type=" << std::hex << msg->ifi_type;
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      loopbackFound = true;
+      EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
+    }
+  }));
+  EXPECT_TRUE(loopbackFound);
+}
+
+struct Link {
+  int index;
+  std::string name;
+};
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  absl::optional<Link> link;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+      if (rta == nullptr) {
+        // Ignore links that do not have a name.
+        return;
+      }
+
+      link = Link();
+      link->index = msg->ifi_index;
+      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    }
+  }));
+  return link;
+}
+
+// CheckLinkMsg checks a netlink message against an expected link.
+void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
+  ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
+  ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+  const struct ifinfomsg* msg =
+      reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+  EXPECT_EQ(msg->ifi_index, link.index);
+
+  const struct rtattr* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+  EXPECT_NE(nullptr, rta) << "IFLA_IFNAME not found in message.";
+  if (rta != nullptr) {
+    std::string name(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    EXPECT_EQ(name, link.name);
+  }
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndex) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = loopback_link->index;
 
-  // Loopback is common among all tests, check that it's found.
-  bool loopbackFound = false;
+  bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckGetLinkResponse(hdr, kSeq, port);
-        if (hdr->nlmsg_type != RTM_NEWLINK) {
-          return;
-        }
-        ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
-        const struct ifinfomsg* msg =
-            reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-        std::cout << "Found interface idx=" << msg->ifi_index
-                  << ", type=" << std::hex << msg->ifi_type;
-        if (msg->ifi_type == ARPHRD_LOOPBACK) {
-          loopbackFound = true;
-          EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
-        }
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
       },
       false));
-  EXPECT_TRUE(loopbackFound);
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
 }
 
-TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+TEST(NetlinkRouteTest, GetLinkByName) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
   };
 
-  constexpr uint32_t kSeq = 12345;
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
+  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  bool found = false;
+  ASSERT_NO_ERRNO(NetlinkRequestResponse(
+      fd, &req, sizeof(req),
+      [&](const struct nlmsghdr* hdr) {
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
+      },
+      false));
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndexNotFound) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = 1234590;
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, GetLinkByNameNotFound) {
+  const std::string name = "nodevice?!";
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(name.size() + 1);
+  strncpy(req.ifname, name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -175,18 +346,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
 
-  ASSERT_NO_ERRNO(NetlinkRequestResponse(
-      fd, &req, sizeof(req),
-      [&](const struct nlmsghdr* hdr) {
-        EXPECT_THAT(hdr->nlmsg_type, Eq(NLMSG_ERROR));
-        EXPECT_EQ(hdr->nlmsg_seq, kSeq);
-        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
-
-        const struct nlmsgerr* msg =
-            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
-        EXPECT_EQ(msg->error, -EOPNOTSUPP);
-      },
-      true));
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(EOPNOTSUPP, ::testing::_));
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
@@ -198,8 +359,6 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -238,8 +397,6 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -282,8 +439,6 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
 
   // This control message is ignored. We still receive a response for the
@@ -317,8 +472,6 @@ TEST(NetlinkRouteTest, GetAddrDump) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -367,6 +520,57 @@ TEST(NetlinkRouteTest, LookupAll) {
   ASSERT_GT(count, 0);
 }
 
+TEST(NetlinkRouteTest, AddAddr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifa;
+    struct rtattr rtattr;
+    struct in_addr addr;
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifa.ifa_family = AF_INET;
+  req.ifa.ifa_prefixlen = 24;
+  req.ifa.ifa_flags = 0;
+  req.ifa.ifa_scope = 0;
+  req.ifa.ifa_index = loopback_link->index;
+  req.rtattr.rta_type = IFA_LOCAL;
+  req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
+  inet_pton(AF_INET, "10.0.0.1", &req.addr);
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  // Create should succeed, as no such address in kernel.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Replace an existing address should succeed.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Create exclusive should fail, as we created the address above.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_THAT(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len),
+      PosixErrorIs(EEXIST, ::testing::_));
+}
+
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
 TEST(NetlinkRouteTest, GetRouteDump) {
   FileDescriptor fd =
@@ -378,8 +582,6 @@ TEST(NetlinkRouteTest, GetRouteDump) {
     struct rtmsg rtm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETROUTE;
@@ -538,8 +740,6 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -615,8 +815,6 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -695,8 +893,6 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -743,8 +939,6 @@ TEST(NetlinkRouteTest, PasscredCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index cd2212a1a..952eecfe8 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -16,6 +16,7 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 #include <sys/socket.h>
 
 #include <vector>
@@ -71,9 +72,10 @@ PosixError NetlinkRequestResponse(
   iov.iov_base = buf.data();
   iov.iov_len = buf.size();
 
-  // Response is a series of NLM_F_MULTI messages, ending with a NLMSG_DONE
-  // message.
+  // If NLM_F_MULTI is set, response is a series of messages that ends with a
+  // NLMSG_DONE message.
   int type = -1;
+  int flags = 0;
   do {
     int len;
     RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
@@ -89,6 +91,7 @@ PosixError NetlinkRequestResponse(
     for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
          NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
       fn(hdr);
+      flags = hdr->nlmsg_flags;
       type = hdr->nlmsg_type;
       // Done should include an integer payload for dump_done_errno.
       // See net/netlink/af_netlink.c:netlink_dump
@@ -98,11 +101,11 @@ PosixError NetlinkRequestResponse(
         EXPECT_GE(hdr->nlmsg_len, NLMSG_LENGTH(sizeof(int)));
       }
     }
-  } while (type != NLMSG_DONE && type != NLMSG_ERROR);
+  } while ((flags & NLM_F_MULTI) && type != NLMSG_DONE && type != NLMSG_ERROR);
 
   if (expect_nlmsgerr) {
     EXPECT_EQ(type, NLMSG_ERROR);
-  } else {
+  } else if (flags & NLM_F_MULTI) {
     EXPECT_EQ(type, NLMSG_DONE);
   }
   return NoError();
@@ -146,5 +149,39 @@ PosixError NetlinkRequestResponseSingle(
   return NoError();
 }
 
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len) {
+  // Dummy negative number for no error message received.
+  // We won't get a negative error number so there will be no confusion.
+  int err = -42;
+  RETURN_IF_ERRNO(NetlinkRequestResponse(
+      fd, request, len,
+      [&](const struct nlmsghdr* hdr) {
+        EXPECT_EQ(NLMSG_ERROR, hdr->nlmsg_type);
+        EXPECT_EQ(hdr->nlmsg_seq, seq);
+        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
+
+        const struct nlmsgerr* msg =
+            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
+        err = -msg->error;
+      },
+      true));
+  return PosixError(err);
+}
+
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr) {
+  const int ifi_space = NLMSG_SPACE(sizeof(*msg));
+  int attrlen = hdr->nlmsg_len - ifi_space;
+  const struct rtattr* rta = reinterpret_cast<const struct rtattr*>(
+      reinterpret_cast<const uint8_t*>(hdr) + NLMSG_ALIGN(ifi_space));
+  for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
+    if (rta->rta_type == attr) {
+      return rta;
+    }
+  }
+  return nullptr;
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index 3678c0599..e13ead406 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -19,6 +19,7 @@
 // socket.h has to be included before if_arp.h.
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -47,6 +48,14 @@ PosixError NetlinkRequestResponseSingle(
     const FileDescriptor& fd, void* request, size_t len,
     const std::function<void(const struct nlmsghdr* hdr)>& fn);
 
+// Send the passed request then expect and return an ack or error.
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len);
+
+// Find rtnetlink attribute in message.
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From eea0eeee933ba8406ae688fce4348271f9513514 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 5 Feb 2020 11:25:10 -0800
Subject: Disable get/set xattrs until list/remove exist too.

PiperOrigin-RevId: 293411655
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++++---
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 +++++----
 test/syscalls/linux/xattr.cc               | 124 +++++++++++++++++++++++++++++
 3 files changed, 159 insertions(+), 29 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 7435b50bf..588f8b087 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,18 +228,21 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 03a39fe65..06e5ee401 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,23 +36,26 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		13:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index ab21d68c6..85eb31847 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -39,6 +39,10 @@ namespace {
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNullName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
@@ -48,6 +52,10 @@ TEST_F(XattrTest, XattrNullName) {
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
@@ -56,6 +64,10 @@ TEST_F(XattrTest, XattrEmptyName) {
 }
 
 TEST_F(XattrTest, XattrLargeName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -77,6 +89,10 @@ TEST_F(XattrTest, XattrLargeName) {
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -88,6 +104,10 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -113,6 +133,10 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -143,6 +167,10 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
@@ -152,6 +180,10 @@ TEST_F(XattrTest, XattrOnDirectory) {
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -163,6 +195,10 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
@@ -181,6 +217,10 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -196,6 +236,10 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -208,6 +252,10 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -223,6 +271,10 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -232,6 +284,10 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -240,6 +296,10 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -256,6 +316,10 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -271,6 +335,10 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -285,6 +353,10 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -296,6 +368,10 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -308,6 +384,10 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -315,6 +395,10 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -327,6 +411,10 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -339,6 +427,10 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -354,6 +446,10 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -367,6 +463,10 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -383,6 +483,10 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -394,6 +498,10 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -410,12 +518,20 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, LGetSetxattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -427,6 +543,10 @@ TEST_F(XattrTest, LGetSetxattrOnSymlink) {
 }
 
 TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -441,6 +561,10 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
 }
 
 TEST_F(XattrTest, FGetSetxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
-- 
cgit v1.2.3


From f3d95607036b8a502c65aa7b3e8145227274dbbc Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 5 Feb 2020 17:56:00 -0800
Subject: recv() on a closed TCP socket returns ENOTCONN

From RFC 793 s3.9 p58 Event Processing:

If RECEIVE Call arrives in CLOSED state and the user has access to such a
connection, the return should be "error: connection does not exist"

Fixes #1598

PiperOrigin-RevId: 293494287
---
 pkg/sentry/socket/netstack/netstack.go | 7 ++++++-
 pkg/tcpip/tcpip.go                     | 4 ++++
 pkg/tcpip/transport/tcp/endpoint.go    | 4 ++--
 pkg/tcpip/transport/tcp/tcp_test.go    | 9 ++++-----
 test/syscalls/linux/tcp_socket.cc      | 9 +++++++++
 5 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 049d04bf2..ed2fbcceb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2229,11 +2229,16 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 	var copied int
 
 	// Copy as many views as possible into the user-provided buffer.
-	for dst.NumBytes() != 0 {
+	for {
+		// Always do at least one fetchReadView, even if the number of bytes to
+		// read is 0.
 		err = s.fetchReadView()
 		if err != nil {
 			break
 		}
+		if dst.NumBytes() == 0 {
+			break
+		}
 
 		var n int
 		var e error
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0fa141d58..d29d9a704 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1124,6 +1124,10 @@ type ReadErrors struct {
 	// InvalidEndpointState is the number of times we found the endpoint state
 	// to be unexpected.
 	InvalidEndpointState StatCounter
+
+	// NotConnected is the number of times we tried to read but found that the
+	// endpoint was not connected.
+	NotConnected StatCounter
 }
 
 // WriteErrors collects packet write errors from an endpoint write call.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b5a8e15ee..e4a6b1b8b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1003,8 +1003,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
-		e.stats.ReadErrors.InvalidEndpointState.Increment()
-		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
+		e.stats.ReadErrors.NotConnected.Increment()
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
 	}
 
 	v, err := e.readLocked()
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2c1505067..cc118c993 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5405,12 +5405,11 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
-	// Expect InvalidEndpointState errors on a read at this point.
-	if _, _, err := ep.Read(nil); err != tcpip.ErrInvalidEndpointState {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrInvalidEndpointState)
+	if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected {
+		t.Errorf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrNotConnected)
 	}
-	if got := ep.Stats().(*tcp.Stats).ReadErrors.InvalidEndpointState.Value(); got != 1 {
-		t.Fatalf("got EP stats Stats.ReadErrors.InvalidEndpointState got %v want %v", got, 1)
+	if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 {
+		t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %v want %v", got, 1)
 	}
 
 	if err := ep.Listen(10); err != nil {
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 525ccbd88..8a8b68e75 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1339,6 +1339,15 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
   EXPECT_EQ(get, kTCPDeferAccept);
 }
 
+TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
+  auto s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  char buf[1];
+  EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallFailsWithErrno(ENOTCONN));
+  EXPECT_THAT(recv(s.get(), buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 1b6a12a768216a99a5e0428c42ea4faf79cf3b50 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 5 Feb 2020 22:45:44 -0800
Subject: Add notes to relevant tests.

These were out-of-band notes that can help provide additional context
and simplify automated imports.

PiperOrigin-RevId: 293525915
---
 pkg/metric/metric.go                          |  1 -
 pkg/sentry/arch/arch_x86.go                   |  4 ++
 pkg/sentry/arch/signal_amd64.go               |  2 +-
 pkg/sentry/fs/file_overlay_test.go            |  1 +
 pkg/sentry/fs/proc/README.md                  |  4 ++
 pkg/sentry/kernel/BUILD                       |  1 +
 pkg/sentry/kernel/kernel.go                   |  3 ++
 pkg/sentry/kernel/kernel_opts.go              | 20 +++++++
 pkg/sentry/socket/hostinet/BUILD              |  1 +
 pkg/sentry/socket/hostinet/socket.go          |  5 +-
 pkg/sentry/socket/hostinet/sockopt_impl.go    | 27 ++++++++++
 pkg/tcpip/transport/tcp/endpoint.go           |  3 ++
 runsc/boot/filter/BUILD                       |  1 +
 runsc/boot/filter/config.go                   | 13 -----
 runsc/boot/filter/config_profile.go           | 34 ++++++++++++
 runsc/container/console_test.go               |  5 +-
 runsc/dockerutil/dockerutil.go                | 11 ++--
 runsc/testutil/BUILD                          |  5 +-
 runsc/testutil/testutil.go                    | 54 -------------------
 runsc/testutil/testutil_runfiles.go           | 75 +++++++++++++++++++++++++++
 test/image/image_test.go                      |  8 +--
 test/syscalls/build_defs.bzl                  | 35 +++++++++++--
 test/syscalls/linux/chroot.cc                 |  2 +-
 test/syscalls/linux/concurrency.cc            |  3 +-
 test/syscalls/linux/exec_proc_exe_workload.cc |  6 +++
 test/syscalls/linux/fork.cc                   |  5 +-
 test/syscalls/linux/mmap.cc                   |  8 +--
 test/syscalls/linux/open_create.cc            |  1 +
 test/syscalls/linux/preadv.cc                 |  1 +
 test/syscalls/linux/proc.cc                   | 46 +++++++++++++---
 test/syscalls/linux/readv.cc                  |  4 +-
 test/syscalls/linux/rseq.cc                   |  2 +-
 test/syscalls/linux/select.cc                 |  2 +-
 test/syscalls/linux/shm.cc                    |  2 +-
 test/syscalls/linux/sigprocmask.cc            |  2 +-
 test/syscalls/linux/socket_unix_non_stream.cc |  4 +-
 test/syscalls/linux/symlink.cc                |  2 +-
 test/syscalls/linux/tcp_socket.cc             |  3 +-
 test/syscalls/linux/time.cc                   |  1 +
 test/syscalls/linux/tkill.cc                  |  2 +-
 test/util/temp_path.cc                        |  1 +
 tools/build/tags.bzl                          |  4 ++
 tools/defs.bzl                                | 17 +++++-
 43 files changed, 318 insertions(+), 113 deletions(-)
 create mode 100644 pkg/sentry/kernel/kernel_opts.go
 create mode 100644 pkg/sentry/socket/hostinet/sockopt_impl.go
 create mode 100644 runsc/boot/filter/config_profile.go
 create mode 100644 runsc/testutil/testutil_runfiles.go

(limited to 'pkg')

diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 93d4f2b8c..006fcd9ab 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -46,7 +46,6 @@ var (
 //
 // TODO(b/67298402): Support non-cumulative metrics.
 // TODO(b/67298427): Support metric fields.
-//
 type Uint64Metric struct {
 	// value is the actual value of the metric. It must be accessed
 	// atomically.
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index a18093155..3db8bd34b 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte {
 	size, align := cpuid.HostFeatureSet().ExtendedStateSize()
 	capacity := size
 	// Always use at least 4096 bytes.
+	//
+	// For the KVM platform, this state is a fixed 4096 bytes, so make sure
+	// that the underlying array is at _least_ that size otherwise we will
+	// corrupt random memory. This is not a pleasant thing to debug.
 	if capacity < 4096 {
 		capacity = 4096
 	}
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 81b92bb43..6fb756f0e 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -55,7 +55,7 @@ type SignalContext64 struct {
 	Trapno  uint64
 	Oldmask linux.SignalSet
 	Cr2     uint64
-	// Pointer to a struct _fpstate.
+	// Pointer to a struct _fpstate. See b/33003106#comment8.
 	Fpstate  uint64
 	Reserved [8]uint64
 }
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index 02538bb4f..a76d87e3a 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) {
 
 // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
 // a frozen dirent tree does not make Readdir calls to the underlying files.
+// This is a regression test for b/114808269.
 func TestReaddirOverlayFrozen(t *testing.T) {
 	ctx := contexttest.Context(t)
 
diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md
index 5d4ec6c7b..6667a0916 100644
--- a/pkg/sentry/fs/proc/README.md
+++ b/pkg/sentry/fs/proc/README.md
@@ -11,6 +11,8 @@ inconsistency, please file a bug.
 
 The following files are implemented:
 
+<!-- mdformat off(don't wrap the table) -->
+
 | File /proc/                 | Content                                               |
 | :------------------------   | :---------------------------------------------------- |
 | [cpuinfo](#cpuinfo)         | Info about the CPU                                    |
@@ -22,6 +24,8 @@ The following files are implemented:
 | [uptime](#uptime)           | Wall clock since boot, combined idle time of all cpus |
 | [version](#version)         | Kernel version                                        |
 
+<!-- mdformat on -->
+
 ### cpuinfo
 
 ```bash
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a27628c0a..2231d6973 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -91,6 +91,7 @@ go_library(
         "fs_context.go",
         "ipc_namespace.go",
         "kernel.go",
+        "kernel_opts.go",
         "kernel_state.go",
         "pending_signals.go",
         "pending_signals_list.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index dcd6e91c4..3ee760ba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -235,6 +235,9 @@ type Kernel struct {
 	// events. This is initialized lazily on the first unimplemented
 	// syscall.
 	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
+
+	// SpecialOpts contains special kernel options.
+	SpecialOpts
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go
new file mode 100644
index 000000000..2e66ec587
--- /dev/null
+++ b/pkg/sentry/kernel/kernel_opts.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// SpecialOpts contains non-standard options for the kernel.
+//
+// +stateify savable
+type SpecialOpts struct{}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 5a07d5d0e..023bad156 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -10,6 +10,7 @@ go_library(
         "save_restore.go",
         "socket.go",
         "socket_unsafe.go",
+        "sockopt_impl.go",
         "stack.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 34f63986f..de76388ac 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -285,7 +285,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	}
 
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := getSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -330,7 +330,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 // SetSockOpt implements socket.Socket.SetSockOpt.
 func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := setSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -353,6 +353,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 			optlen = sizeofInt32
 		}
 	}
+
 	if optlen == 0 {
 		// Pretend to accept socket options we don't understand. This seems
 		// dangerous, but it's what netstack does...
diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go
new file mode 100644
index 000000000..8a783712e
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/sockopt_impl.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostinet
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+func getSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
+
+func setSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e4a6b1b8b..f2be0e651 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2166,6 +2166,9 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	e.isRegistered = true
 	e.setEndpointState(StateListen)
 
+	// The channel may be non-nil when we're restoring the endpoint, and it
+	// may be pre-populated with some previously accepted (but not Accepted)
+	// endpoints.
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index ce30f6c53..ed18f0047 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -8,6 +8,7 @@ go_library(
         "config.go",
         "config_amd64.go",
         "config_arm64.go",
+        "config_profile.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index f8d351c7b..c69f4c602 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -536,16 +536,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 		},
 	}
 }
-
-// profileFilters returns extra syscalls made by runtime/pprof package.
-func profileFilters() seccomp.SyscallRules {
-	return seccomp.SyscallRules{
-		syscall.SYS_OPENAT: []seccomp.Rule{
-			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
-			},
-		},
-	}
-}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+			},
+		},
+	}
+}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 060b63bf3..c2518d52b 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -196,7 +196,10 @@ func TestJobControlSignalExec(t *testing.T) {
 	defer ptyMaster.Close()
 	defer ptySlave.Close()
 
-	// Exec bash and attach a terminal.
+	// Exec bash and attach a terminal. Note that occasionally /bin/sh
+	// may be a different shell or have a different configuration (such
+	// as disabling interactive mode and job control). Since we want to
+	// explicitly test interactive mode, use /bin/bash. See b/116981926.
 	execArgs := &control.ExecArgs{
 		Filename: "/bin/bash",
 		// Don't let bash execute from profile or rc files, otherwise
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 9b6346ca2..1ff5e8cc3 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -143,8 +143,11 @@ func PrepareFiles(names ...string) (string, error) {
 		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
 	}
 	for _, name := range names {
-		src := getLocalPath(name)
-		dst := path.Join(dir, name)
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err)
+		}
+		dst := path.Join(dir, path.Base(name))
 		if err := testutil.Copy(src, dst); err != nil {
 			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
 		}
@@ -152,10 +155,6 @@ func PrepareFiles(names ...string) (string, error) {
 	return dir, nil
 }
 
-func getLocalPath(file string) string {
-	return path.Join(".", file)
-}
-
 // do executes docker command.
 func do(args ...string) (string, error) {
 	log.Printf("Running: docker %s\n", args)
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index f845120b0..945405303 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "testutil",
     testonly = 1,
-    srcs = ["testutil.go"],
+    srcs = [
+        "testutil.go",
+        "testutil_runfiles.go",
+    ],
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index edf2e809a..80c2c9680 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -79,60 +79,6 @@ func ConfigureExePath() error {
 	return nil
 }
 
-// FindFile searchs for a file inside the test run environment. It returns the
-// full path to the file. It fails if none or more than one file is found.
-func FindFile(path string) (string, error) {
-	wd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	// The test root is demarcated by a path element called "__main__". Search for
-	// it backwards from the working directory.
-	root := wd
-	for {
-		dir, name := filepath.Split(root)
-		if name == "__main__" {
-			break
-		}
-		if len(dir) == 0 {
-			return "", fmt.Errorf("directory __main__ not found in %q", wd)
-		}
-		// Remove ending slash to loop around.
-		root = dir[:len(dir)-1]
-	}
-
-	// Annoyingly, bazel adds the build type to the directory path for go
-	// binaries, but not for c++ binaries. We use two different patterns to
-	// to find our file.
-	patterns := []string{
-		// Try the obvious path first.
-		filepath.Join(root, path),
-		// If it was a go binary, use a wildcard to match the build
-		// type. The pattern is: /test-path/__main__/directories/*/file.
-		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
-	}
-
-	for _, p := range patterns {
-		matches, err := filepath.Glob(p)
-		if err != nil {
-			// "The only possible returned error is ErrBadPattern,
-			// when pattern is malformed." -godoc
-			return "", fmt.Errorf("error globbing %q: %v", p, err)
-		}
-		switch len(matches) {
-		case 0:
-			// Try the next pattern.
-		case 1:
-			// We found it.
-			return matches[0], nil
-		default:
-			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
-		}
-	}
-	return "", fmt.Errorf("file %q not found", path)
-}
-
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
 func TestConfig() *boot.Config {
diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go
new file mode 100644
index 000000000..ece9ea9a1
--- /dev/null
+++ b/runsc/testutil/testutil_runfiles.go
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// FindFile searchs for a file inside the test run environment. It returns the
+// full path to the file. It fails if none or more than one file is found.
+func FindFile(path string) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	// The test root is demarcated by a path element called "__main__". Search for
+	// it backwards from the working directory.
+	root := wd
+	for {
+		dir, name := filepath.Split(root)
+		if name == "__main__" {
+			break
+		}
+		if len(dir) == 0 {
+			return "", fmt.Errorf("directory __main__ not found in %q", wd)
+		}
+		// Remove ending slash to loop around.
+		root = dir[:len(dir)-1]
+	}
+
+	// Annoyingly, bazel adds the build type to the directory path for go
+	// binaries, but not for c++ binaries. We use two different patterns to
+	// to find our file.
+	patterns := []string{
+		// Try the obvious path first.
+		filepath.Join(root, path),
+		// If it was a go binary, use a wildcard to match the build
+		// type. The pattern is: /test-path/__main__/directories/*/file.
+		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
+	}
+
+	for _, p := range patterns {
+		matches, err := filepath.Glob(p)
+		if err != nil {
+			// "The only possible returned error is ErrBadPattern,
+			// when pattern is malformed." -godoc
+			return "", fmt.Errorf("error globbing %q: %v", p, err)
+		}
+		switch len(matches) {
+		case 0:
+			// Try the next pattern.
+		case 1:
+			// We found it.
+			return matches[0], nil
+		default:
+			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
+		}
+	}
+	return "", fmt.Errorf("file %q not found", path)
+}
diff --git a/test/image/image_test.go b/test/image/image_test.go
index d0dcb1861..0a1e19d6f 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -107,7 +107,7 @@ func TestHttpd(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("http-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -139,7 +139,7 @@ func TestNginx(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("net-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -183,7 +183,7 @@ func TestMysql(t *testing.T) {
 	}
 
 	client := dockerutil.MakeDocker("mysql-client-test")
-	dir, err := dockerutil.PrepareFiles("mysql.sql")
+	dir, err := dockerutil.PrepareFiles("test/image/mysql.sql")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -283,7 +283,7 @@ func TestRuby(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("ruby-test")
 
-	dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh")
+	dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index 1df761dd0..cbab85ef7 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -2,8 +2,6 @@
 
 load("//tools:defs.bzl", "loopback")
 
-# syscall_test is a macro that will create targets to run the given test target
-# on the host (native) and runsc.
 def syscall_test(
         test,
         shard_count = 5,
@@ -13,6 +11,19 @@ def syscall_test(
         add_uds_tree = False,
         add_hostinet = False,
         tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+
     _syscall_test(
         test = test,
         shard_count = shard_count,
@@ -111,6 +122,19 @@ def _syscall_test(
     # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
     tags += [full_platform, "file_" + file_access]
 
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
     # Add tag to prevent the tests from running in a Bazel sandbox.
     # TODO(b/120560048): Make the tests run without this tag.
     tags.append("no-sandbox")
@@ -118,8 +142,11 @@ def _syscall_test(
     # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
     # more stable.
     if platform == "kvm":
-        tags += ["manual"]
-        tags += ["requires-kvm"]
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
 
     args = [
         # Arguments are passed directly to syscall_test_runner binary.
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 0a2d44a2c..85ec013d5 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -167,7 +167,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 }
 
 // Test that link resolution in a chroot can escape the root by following an
-// open proc fd.
+// open proc fd. Regression test for b/32316719.
 TEST(ChrootTest, ProcFdLinkResolutionInChroot) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
 
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index f41f99900..7cd6a75bd 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -46,7 +46,8 @@ TEST(ConcurrencyTest, SingleProcessMultithreaded) {
 }
 
 // Test that multiple threads in this process continue to execute in parallel,
-// even if an unrelated second process is spawned.
+// even if an unrelated second process is spawned. Regression test for
+// b/32119508.
 TEST(ConcurrencyTest, MultiProcessMultithreaded) {
   // In PID 1, start TIDs 1 and 2, and put both to sleep.
   //
diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc
index b790fe5be..2989379b7 100644
--- a/test/syscalls/linux/exec_proc_exe_workload.cc
+++ b/test/syscalls/linux/exec_proc_exe_workload.cc
@@ -21,6 +21,12 @@
 #include "test/util/posix_error.h"
 
 int main(int argc, char** argv, char** envp) {
+  // This is annoying. Because remote build systems may put these binaries
+  // in a content-addressable-store, you may wind up with /proc/self/exe
+  // pointing to some random path (but with a sensible argv[0]).
+  //
+  // Therefore, this test simply checks that the /proc/self/exe
+  // is absolute and *doesn't* match argv[1].
   std::string exe =
       gvisor::testing::ProcessExePath(getpid()).ValueOrDie();
   if (exe[0] != '/') {
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 906f3358d..ff8bdfeb0 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -271,7 +271,7 @@ TEST_F(ForkTest, Alarm) {
   EXPECT_EQ(0, alarmed);
 }
 
-// Child cannot affect parent private memory.
+// Child cannot affect parent private memory. Regression test for b/24137240.
 TEST_F(ForkTest, PrivateMemory) {
   std::atomic<uint32_t> local(0);
 
@@ -298,6 +298,9 @@ TEST_F(ForkTest, PrivateMemory) {
 }
 
 // Kernel-accessed buffers should remain coherent across COW.
+//
+// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates
+// differently. Regression test for b/33811887.
 TEST_F(ForkTest, COWSegment) {
   constexpr int kBufSize = 1024;
   char* read_buf = private_;
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 1c4d9f1c7..11fb1b457 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -1418,7 +1418,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
 //
 // On most platforms this is trivial, but when the file is mapped via the sentry
 // page cache (which does not yet support writing to shared mappings), a bug
-// caused reads to fail unnecessarily on such mappings.
+// caused reads to fail unnecessarily on such mappings. See b/28913513.
 TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
   uintptr_t addr;
   size_t len = strlen(kFileContents);
@@ -1435,7 +1435,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
 
 // Tests that EFAULT is returned when invoking a syscall that requires the OS to
 // read past end of file (resulting in a fault in sentry context in the gVisor
-// case).
+// case). See b/28913513.
 TEST_F(MMapFileTest, InternalSigBus) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
@@ -1578,7 +1578,7 @@ TEST_F(MMapFileTest, Bug38498194) {
 }
 
 // Tests that reading from a file to a memory mapping of the same file does not
-// deadlock.
+// deadlock. See b/34813270.
 TEST_F(MMapFileTest, SelfRead) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
@@ -1590,7 +1590,7 @@ TEST_F(MMapFileTest, SelfRead) {
 }
 
 // Tests that writing to a file from a memory mapping of the same file does not
-// deadlock.
+// deadlock. Regression test for b/34813270.
 TEST_F(MMapFileTest, SelfWrite) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 431733dbe..902d0a0dc 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -132,6 +132,7 @@ TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) {
 }
 
 // A file originally created RW, but opened RO can later be opened RW.
+// Regression test for b/65385065.
 TEST(CreateTest, OpenCreateROThenRW) {
   TempPath file(NewTempAbsPath());
 
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
index f7ea44054..5b0743fe9 100644
--- a/test/syscalls/linux/preadv.cc
+++ b/test/syscalls/linux/preadv.cc
@@ -37,6 +37,7 @@ namespace testing {
 
 namespace {
 
+// Stress copy-on-write. Attempts to reproduce b/38430174.
 TEST(PreadvTest, MMConcurrencyStress) {
   // Fill a one-page file with zeroes (the contents don't really matter).
   const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 169b723eb..a23fdb58d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1352,13 +1352,19 @@ TEST(ProcPidSymlink, SubprocessZombied) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 & gVisor: Syscall succeeds and returns 1
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 &  gVisor: Syscall succeeds and returns 1.
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 }
@@ -1431,8 +1437,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 TEST(ProcPidFile, SubprocessZombie) {
   char buf[1];
 
-  // 4.17: Succeeds and returns 1
-  // gVisor: Succeeds and returns 0
+  // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent
+  // behavior on different kernels.
+  //
+  // ~4.3: Succeds and returns 0.
+  // 4.17: Succeeds and returns 1.
+  // gVisor: Succeeds and returns 0.
   EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds());
 
   EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)),
@@ -1458,7 +1468,10 @@ TEST(ProcPidFile, SubprocessZombie) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
+  //
+  // ~4.3: Fails and returns EACCES.
   // gVisor & 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)),
   //          SyscallFailsWithErrno(EACCES));
 }
@@ -1467,9 +1480,12 @@ TEST(ProcPidFile, SubprocessZombie) {
 TEST(ProcPidFile, SubprocessExited) {
   char buf[1];
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels.
+  //
+  // ~4.3: Fails and returns ESRCH.
   // gVisor: Fails with ESRCH.
   // 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(ESRCH));
 
@@ -1641,7 +1657,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task",
                                      TaskFiles(initial, {child1.Tid()})));
 
-  // Stat child1's task file.
+  // Stat child1's task file. Regression test for b/32097707.
   struct stat statbuf;
   const std::string child1_task_file =
       absl::StrCat("/proc/self/task/", child1.Tid());
@@ -1669,7 +1685,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(EventuallyDirContainsExactly(
       "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()})));
 
-  // Stat child1's task file again.  This time it should fail.
+  // Stat child1's task file again.  This time it should fail. See b/32097707.
   EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf),
               SyscallFailsWithErrno(ENOENT));
 
@@ -1824,7 +1840,7 @@ TEST(ProcSysVmOvercommitMemory, HasNumericValue) {
 }
 
 // Check that link for proc fd entries point the target node, not the
-// symlink itself.
+// symlink itself. Regression test for b/31155070.
 TEST(ProcTaskFd, FstatatFollowsSymlink) {
   const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor fd =
@@ -1883,6 +1899,20 @@ TEST(ProcMounts, IsSymlink) {
   EXPECT_EQ(link, "self/mounts");
 }
 
+TEST(ProcSelfMountinfo, RequiredFieldsArePresent) {
+  auto mountinfo =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo"));
+  EXPECT_THAT(
+      mountinfo,
+      AllOf(
+          // Root mount.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"),
+          // Proc mount - always rw.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)")));
+}
+
 // Check that /proc/self/mounts looks something like a real mounts file.
 TEST(ProcSelfMounts, RequiredFieldsArePresent) {
   auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts"));
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
index 4069cbc7e..baaf9f757 100644
--- a/test/syscalls/linux/readv.cc
+++ b/test/syscalls/linux/readv.cc
@@ -254,7 +254,9 @@ TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) {
 // This test depends on the maximum extent of a single readv() syscall, so
 // we can't tolerate interruption from saving.
 TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) {
-  // Ensure that we won't be interrupted by ITIMER_PROF.
+  // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly
+  // important in environments where automated profiling tools may start
+  // ITIMER_PROF automatically.
   struct itimerval itv = {};
   auto const cleanup_itimer =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv));
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 106c045e3..4bfb1ff56 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -36,7 +36,7 @@ namespace {
 // We must be very careful about how these tests are written. Each thread may
 // only have one struct rseq registration, which may be done automatically at
 // thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
-// not do so).
+// not do so, but other libraries do).
 //
 // Testing of rseq is thus done primarily in a child process with no
 // registration. This means exec'ing a nostdlib binary, as rseq registration can
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index 424e2a67f..be2364fb8 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -146,7 +146,7 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) {
 
 // This test illustrates Linux's behavior of 'select' calls passing after
 // setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on
-// this behavior.
+// this behavior. See b/122318458.
 TEST_F(SelectTest, SetrlimitCallNOFILE) {
   fd_set read_set;
   FD_ZERO(&read_set);
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index 7ba752599..c7fdbb924 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -473,7 +473,7 @@ TEST(ShmTest, PartialUnmap) {
 }
 
 // Check that sentry does not panic when asked for a zero-length private shm
-// segment.
+// segment. Regression test for b/110694797.
 TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) {
   EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _));
 }
diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc
index 654c6a47f..a603fc1d1 100644
--- a/test/syscalls/linux/sigprocmask.cc
+++ b/test/syscalls/linux/sigprocmask.cc
@@ -237,7 +237,7 @@ TEST_F(SigProcMaskTest, SignalHandler) {
 }
 
 // Check that sigprocmask correctly handles aliasing of the set and oldset
-// pointers.
+// pointers. Regression test for b/30502311.
 TEST_F(SigProcMaskTest, AliasedSets) {
   sigset_t mask;
 
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index 276a94eb8..884319e1d 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -109,7 +109,7 @@ PosixErrorOr<std::vector<Mapping>> CreateFragmentedRegion(const int size,
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be sent
-// successfully.
+// successfully. See b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -165,7 +165,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be received
-// into successfully.
+// into successfully. Regression test for b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index b249ff91f..03ee1250d 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -38,7 +38,7 @@ mode_t FilePermission(const std::string& path) {
 }
 
 // Test that name collisions are checked on the new link path, not the source
-// path.
+// path. Regression test for b/31782115.
 TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) {
   const std::string srcname = NewTempAbsPath();
   const std::string newname = NewTempAbsPath();
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 8a8b68e75..c4591a3b9 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -244,7 +244,8 @@ TEST_P(TcpSocketTest, ZeroWriteAllowed) {
 }
 
 // Test that a non-blocking write with a buffer that is larger than the send
-// buffer size will not actually write the whole thing at once.
+// buffer size will not actually write the whole thing at once. Regression test
+// for b/64438887.
 TEST_P(TcpSocketTest, NonblockingLargeWrite) {
   // Set the FD to O_NONBLOCK.
   int opts;
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index c7eead17e..1ccb95733 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -62,6 +62,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
               ::testing::KilledBySignal(SIGSEGV), "");
 }
 
+// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2.
 int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
   constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
   return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc
index bae377c69..8d8ebbb24 100644
--- a/test/syscalls/linux/tkill.cc
+++ b/test/syscalls/linux/tkill.cc
@@ -54,7 +54,7 @@ void SigHandler(int sig, siginfo_t* info, void* context) {
   TEST_CHECK(info->si_code == SI_TKILL);
 }
 
-// Test with a real signal.
+// Test with a real signal. Regression test for b/24790092.
 TEST(TkillTest, ValidTIDAndRealSignal) {
   struct sigaction sa;
   sa.sa_sigaction = SigHandler;
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index 35aacb172..9c10b6674 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -77,6 +77,7 @@ std::string NewTempAbsPath() {
 std::string NewTempRelPath() { return NextTempBasename(); }
 
 std::string GetAbsoluteTestTmpdir() {
+  // Note that TEST_TMPDIR is guaranteed to be set.
   char* env_tmpdir = getenv("TEST_TMPDIR");
   std::string tmp_dir =
       env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp";
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
index e99c87f81..a6db44e47 100644
--- a/tools/build/tags.bzl
+++ b/tools/build/tags.bzl
@@ -33,4 +33,8 @@ go_suffixes = [
     "_wasm_unsafe",
     "_linux",
     "_linux_unsafe",
+    "_opts",
+    "_opts_unsafe",
+    "_impl",
+    "_impl_unsafe",
 ]
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 5d5fa134a..c03b557ae 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -73,6 +73,16 @@ def calculate_sets(srcs):
             result[target].append(file)
     return result
 
+def go_imports(name, src, out):
+    """Simplify a single Go source file by eliminating unused imports."""
+    native.genrule(
+        name = name,
+        srcs = [src],
+        outs = [out],
+        tools = ["@org_golang_x_tools//cmd/goimports:goimports"],
+        cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
+    )
+
 def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
@@ -107,10 +117,15 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         state_sets = calculate_sets(srcs)
         for (suffix, srcs) in state_sets.items():
             go_stateify(
-                name = name + suffix + "_state_autogen",
+                name = name + suffix + "_state_autogen_with_imports",
                 srcs = srcs,
                 imports = imports,
                 package = name,
+                out = name + suffix + "_state_autogen_with_imports.go",
+            )
+            go_imports(
+                name = name + suffix + "_state_autogen",
+                src = name + suffix + "_state_autogen_with_imports.go",
                 out = name + suffix + "_state_autogen.go",
             )
         all_srcs = all_srcs + [
-- 
cgit v1.2.3


From 5ff780891e229dbde00d9a37c2f8b6681e592fdb Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 6 Feb 2020 10:06:56 -0800
Subject: Move p9.pool to a separate package

PiperOrigin-RevId: 293617493
---
 pkg/p9/BUILD          |  3 +--
 pkg/p9/client.go      |  9 ++++---
 pkg/p9/pool.go        | 68 ---------------------------------------------------
 pkg/p9/pool_test.go   | 64 ------------------------------------------------
 pkg/pool/BUILD        | 25 +++++++++++++++++++
 pkg/pool/pool.go      | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 pkg/pool/pool_test.go | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 161 insertions(+), 138 deletions(-)
 delete mode 100644 pkg/p9/pool.go
 delete mode 100644 pkg/p9/pool_test.go
 create mode 100644 pkg/pool/BUILD
 create mode 100644 pkg/pool/pool.go
 create mode 100644 pkg/pool/pool_test.go

(limited to 'pkg')

diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD
index 4ccc1de86..8904afad9 100644
--- a/pkg/p9/BUILD
+++ b/pkg/p9/BUILD
@@ -16,7 +16,6 @@ go_library(
         "messages.go",
         "p9.go",
         "path_tree.go",
-        "pool.go",
         "server.go",
         "transport.go",
         "transport_flipcall.go",
@@ -27,6 +26,7 @@ go_library(
         "//pkg/fdchannel",
         "//pkg/flipcall",
         "//pkg/log",
+        "//pkg/pool",
         "//pkg/sync",
         "//pkg/unet",
         "@org_golang_x_sys//unix:go_default_library",
@@ -41,7 +41,6 @@ go_test(
         "client_test.go",
         "messages_test.go",
         "p9_test.go",
-        "pool_test.go",
         "transport_test.go",
         "version_test.go",
     ],
diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index 4045e41fa..a6f493b82 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -22,6 +22,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/flipcall"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/pool"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 )
@@ -74,10 +75,10 @@ type Client struct {
 	socket *unet.Socket
 
 	// tagPool is the collection of available tags.
-	tagPool pool
+	tagPool pool.Pool
 
 	// fidPool is the collection of available fids.
-	fidPool pool
+	fidPool pool.Pool
 
 	// messageSize is the maximum total size of a message.
 	messageSize uint32
@@ -155,8 +156,8 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client
 	}
 	c := &Client{
 		socket:      socket,
-		tagPool:     pool{start: 1, limit: uint64(NoTag)},
-		fidPool:     pool{start: 1, limit: uint64(NoFID)},
+		tagPool:     pool.Pool{Start: 1, Limit: uint64(NoTag)},
+		fidPool:     pool.Pool{Start: 1, Limit: uint64(NoFID)},
 		pending:     make(map[Tag]*response),
 		recvr:       make(chan bool, 1),
 		messageSize: messageSize,
diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go
deleted file mode 100644
index 2b14a5ce3..000000000
--- a/pkg/p9/pool.go
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package p9
-
-import (
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// pool is a simple allocator.
-//
-// It is used for both tags and FIDs.
-type pool struct {
-	mu sync.Mutex
-
-	// cache is the set of returned values.
-	cache []uint64
-
-	// start is the starting value (if needed).
-	start uint64
-
-	// max is the current maximum issued.
-	max uint64
-
-	// limit is the upper limit.
-	limit uint64
-}
-
-// Get gets a value from the pool.
-func (p *pool) Get() (uint64, bool) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Anything cached?
-	if len(p.cache) > 0 {
-		v := p.cache[len(p.cache)-1]
-		p.cache = p.cache[:len(p.cache)-1]
-		return v, true
-	}
-
-	// Over the limit?
-	if p.start == p.limit {
-		return 0, false
-	}
-
-	// Generate a new value.
-	v := p.start
-	p.start++
-	return v, true
-}
-
-// Put returns a value to the pool.
-func (p *pool) Put(v uint64) {
-	p.mu.Lock()
-	p.cache = append(p.cache, v)
-	p.mu.Unlock()
-}
diff --git a/pkg/p9/pool_test.go b/pkg/p9/pool_test.go
deleted file mode 100644
index e4746b8da..000000000
--- a/pkg/p9/pool_test.go
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package p9
-
-import (
-	"testing"
-)
-
-func TestPoolUnique(t *testing.T) {
-	p := pool{start: 1, limit: 3}
-	got := make(map[uint64]bool)
-
-	for {
-		n, ok := p.Get()
-		if !ok {
-			break
-		}
-
-		// Check unique.
-		if _, ok := got[n]; ok {
-			t.Errorf("pool spit out %v multiple times", n)
-		}
-
-		// Record.
-		got[n] = true
-	}
-}
-
-func TestExausted(t *testing.T) {
-	p := pool{start: 1, limit: 500}
-	for i := 0; i < 499; i++ {
-		_, ok := p.Get()
-		if !ok {
-			t.Fatalf("pool exhausted before 499 items")
-		}
-	}
-
-	_, ok := p.Get()
-	if ok {
-		t.Errorf("pool not exhausted when it should be")
-	}
-}
-
-func TestPoolRecycle(t *testing.T) {
-	p := pool{start: 1, limit: 500}
-	n1, _ := p.Get()
-	p.Put(n1)
-	n2, _ := p.Get()
-	if n1 != n2 {
-		t.Errorf("pool not recycling items")
-	}
-}
diff --git a/pkg/pool/BUILD b/pkg/pool/BUILD
new file mode 100644
index 000000000..7b1c6b75b
--- /dev/null
+++ b/pkg/pool/BUILD
@@ -0,0 +1,25 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "pool",
+    srcs = [
+        "pool.go",
+    ],
+    deps = [
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "pool_test",
+    size = "small",
+    srcs = [
+        "pool_test.go",
+    ],
+    library = ":pool",
+)
diff --git a/pkg/pool/pool.go b/pkg/pool/pool.go
new file mode 100644
index 000000000..a1b2e0cfe
--- /dev/null
+++ b/pkg/pool/pool.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pool
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Pool is a simple allocator.
+type Pool struct {
+	mu sync.Mutex
+
+	// cache is the set of returned values.
+	cache []uint64
+
+	// Start is the starting value (if needed).
+	Start uint64
+
+	// max is the current maximum issued.
+	max uint64
+
+	// Limit is the upper limit.
+	Limit uint64
+}
+
+// Get gets a value from the pool.
+func (p *Pool) Get() (uint64, bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// Anything cached?
+	if len(p.cache) > 0 {
+		v := p.cache[len(p.cache)-1]
+		p.cache = p.cache[:len(p.cache)-1]
+		return v, true
+	}
+
+	// Over the limit?
+	if p.Start == p.Limit {
+		return 0, false
+	}
+
+	// Generate a new value.
+	v := p.Start
+	p.Start++
+	return v, true
+}
+
+// Put returns a value to the pool.
+func (p *Pool) Put(v uint64) {
+	p.mu.Lock()
+	p.cache = append(p.cache, v)
+	p.mu.Unlock()
+}
diff --git a/pkg/pool/pool_test.go b/pkg/pool/pool_test.go
new file mode 100644
index 000000000..d928439c1
--- /dev/null
+++ b/pkg/pool/pool_test.go
@@ -0,0 +1,64 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pool
+
+import (
+	"testing"
+)
+
+func TestPoolUnique(t *testing.T) {
+	p := Pool{Start: 1, Limit: 3}
+	got := make(map[uint64]bool)
+
+	for {
+		n, ok := p.Get()
+		if !ok {
+			break
+		}
+
+		// Check unique.
+		if _, ok := got[n]; ok {
+			t.Errorf("pool spit out %v multiple times", n)
+		}
+
+		// Record.
+		got[n] = true
+	}
+}
+
+func TestExausted(t *testing.T) {
+	p := Pool{Start: 1, Limit: 500}
+	for i := 0; i < 499; i++ {
+		_, ok := p.Get()
+		if !ok {
+			t.Fatalf("pool exhausted before 499 items")
+		}
+	}
+
+	_, ok := p.Get()
+	if ok {
+		t.Errorf("pool not exhausted when it should be")
+	}
+}
+
+func TestPoolRecycle(t *testing.T) {
+	p := Pool{Start: 1, Limit: 500}
+	n1, _ := p.Get()
+	p.Put(n1)
+	n2, _ := p.Get()
+	if n1 != n2 {
+		t.Errorf("pool not recycling items")
+	}
+}
-- 
cgit v1.2.3


From 6bd59b4e08893281468e8af5aebb5fab0f7a8c0d Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 11:12:41 -0800
Subject: Update link address for targets of Neighbor Adverts

Get the link address for the target of an NDP Neighbor Advertisement
from the NDP Target Link Layer Address option.

Tests:
- ipv6.TestNeighorAdvertisementWithTargetLinkLayerOption
- ipv6.TestNeighorAdvertisementWithInvalidTargetLinkLayerOption
PiperOrigin-RevId: 293632609
---
 pkg/tcpip/network/ipv6/icmp.go      |  44 ++++--
 pkg/tcpip/network/ipv6/icmp_test.go | 186 +++++++++++++++---------
 pkg/tcpip/network/ipv6/ndp_test.go  | 278 +++++++++++++++++++++++++-----------
 pkg/tcpip/stack/ndp_test.go         |   8 +-
 4 files changed, 352 insertions(+), 164 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 60817d36d..45dc757c7 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -15,6 +15,8 @@
 package ipv6
 
 import (
+	"log"
+
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -194,7 +196,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		// TODO(b/148429853): Properly process the NS message and do Neighbor
 		// Unreachability Detection.
 		for {
-			opt, done, _ := it.Next()
+			opt, done, err := it.Next()
+			if err != nil {
+				// This should never happen as Iter(true) above did not return an error.
+				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+			}
 			if done {
 				break
 			}
@@ -253,21 +259,25 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		}
 
 		na := header.NDPNeighborAdvert(h.NDPPayload())
+		it, err := na.Options().Iter(true)
+		if err != nil {
+			// If we have a malformed NDP NA option, drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
 		targetAddr := na.TargetAddress()
 		stack := r.Stack()
 		rxNICID := r.NICID()
 
-		isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr)
-		if err != nil {
+		if isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr); err != nil {
 			// We will only get an error if rxNICID is unrecognized,
 			// which should not happen. For now short-circuit this
 			// packet.
 			//
 			// TODO(b/141002840): Handle this better?
 			return
-		}
-
-		if isTentative {
+		} else if isTentative {
 			// We just got an NA from a node that owns an address we
 			// are performing DAD on, implying the address is not
 			// unique. In this case we let the stack know so it can
@@ -283,13 +293,29 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		// scenario is beyond the scope of RFC 4862. As such, we simply
 		// ignore such a scenario for now and proceed as normal.
 		//
+		// If the NA message has the target link layer option, update the link
+		// address cache with the link address for the target of the message.
+		//
 		// TODO(b/143147598): Handle the scenario described above. Also
 		// inform the netstack integration that a duplicate address was
 		// detected outside of DAD.
+		//
+		// TODO(b/148429853): Properly process the NA message and do Neighbor
+		// Unreachability Detection.
+		for {
+			opt, done, err := it.Next()
+			if err != nil {
+				// This should never happen as Iter(true) above did not return an error.
+				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+			}
+			if done {
+				break
+			}
 
-		e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, r.RemoteLinkAddress)
-		if targetAddr != r.RemoteAddress {
-			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, r.RemoteLinkAddress)
+			switch opt := opt.(type) {
+			case header.NDPTargetLinkLayerAddressOption:
+				e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, opt.EthernetAddress())
+			}
 		}
 
 	case header.ICMPv6EchoRequest:
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index d0e930e20..50c4b6474 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -121,21 +121,60 @@ func TestICMPCounts(t *testing.T) {
 	}
 	defer r.Release()
 
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
 	types := []struct {
-		typ  header.ICMPv6Type
-		size int
+		typ       header.ICMPv6Type
+		size      int
+		extraData []byte
 	}{
-		{header.ICMPv6DstUnreachable, header.ICMPv6DstUnreachableMinimumSize},
-		{header.ICMPv6PacketTooBig, header.ICMPv6PacketTooBigMinimumSize},
-		{header.ICMPv6TimeExceeded, header.ICMPv6MinimumSize},
-		{header.ICMPv6ParamProblem, header.ICMPv6MinimumSize},
-		{header.ICMPv6EchoRequest, header.ICMPv6EchoMinimumSize},
-		{header.ICMPv6EchoReply, header.ICMPv6EchoMinimumSize},
-		{header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize},
-		{header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize},
-		{header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize},
-		{header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize},
-		{header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize},
+		{
+			typ:  header.ICMPv6DstUnreachable,
+			size: header.ICMPv6DstUnreachableMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6PacketTooBig,
+			size: header.ICMPv6PacketTooBigMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6TimeExceeded,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6ParamProblem,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6EchoRequest,
+			size: header.ICMPv6EchoMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6EchoReply,
+			size: header.ICMPv6EchoMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize},
+		{
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+		},
+		{
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+		},
 	}
 
 	handleIPv6Payload := func(hdr buffer.Prependable) {
@@ -154,10 +193,13 @@ func TestICMPCounts(t *testing.T) {
 	}
 
 	for _, typ := range types {
-		hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size)
+		extraDataLen := len(typ.extraData)
+		hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
+		extraData := buffer.View(hdr.Prepend(extraDataLen))
+		copy(extraData, typ.extraData)
 		pkt := header.ICMPv6(hdr.Prepend(typ.size))
 		pkt.SetType(typ.typ)
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
 
 		handleIPv6Payload(hdr)
 	}
@@ -372,97 +414,104 @@ func TestLinkResolution(t *testing.T) {
 }
 
 func TestICMPChecksumValidationSimple(t *testing.T) {
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
 	types := []struct {
 		name        string
 		typ         header.ICMPv6Type
 		size        int
+		extraData   []byte
 		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
 	}{
 		{
-			"DstUnreachable",
-			header.ICMPv6DstUnreachable,
-			header.ICMPv6DstUnreachableMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "DstUnreachable",
+			typ:  header.ICMPv6DstUnreachable,
+			size: header.ICMPv6DstUnreachableMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.DstUnreachable
 			},
 		},
 		{
-			"PacketTooBig",
-			header.ICMPv6PacketTooBig,
-			header.ICMPv6PacketTooBigMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "PacketTooBig",
+			typ:  header.ICMPv6PacketTooBig,
+			size: header.ICMPv6PacketTooBigMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.PacketTooBig
 			},
 		},
 		{
-			"TimeExceeded",
-			header.ICMPv6TimeExceeded,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "TimeExceeded",
+			typ:  header.ICMPv6TimeExceeded,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.TimeExceeded
 			},
 		},
 		{
-			"ParamProblem",
-			header.ICMPv6ParamProblem,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "ParamProblem",
+			typ:  header.ICMPv6ParamProblem,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.ParamProblem
 			},
 		},
 		{
-			"EchoRequest",
-			header.ICMPv6EchoRequest,
-			header.ICMPv6EchoMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "EchoRequest",
+			typ:  header.ICMPv6EchoRequest,
+			size: header.ICMPv6EchoMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.EchoRequest
 			},
 		},
 		{
-			"EchoReply",
-			header.ICMPv6EchoReply,
-			header.ICMPv6EchoMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "EchoReply",
+			typ:  header.ICMPv6EchoReply,
+			size: header.ICMPv6EchoMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.EchoReply
 			},
 		},
 		{
-			"RouterSolicit",
-			header.ICMPv6RouterSolicit,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "RouterSolicit",
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RouterSolicit
 			},
 		},
 		{
-			"RouterAdvert",
-			header.ICMPv6RouterAdvert,
-			header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "RouterAdvert",
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RouterAdvert
 			},
 		},
 		{
-			"NeighborSolicit",
-			header.ICMPv6NeighborSolicit,
-			header.ICMPv6NeighborSolicitMinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "NeighborSolicit",
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.NeighborSolicit
 			},
 		},
 		{
-			"NeighborAdvert",
-			header.ICMPv6NeighborAdvert,
-			header.ICMPv6NeighborAdvertSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name:      "NeighborAdvert",
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.NeighborAdvert
 			},
 		},
 		{
-			"RedirectMsg",
-			header.ICMPv6RedirectMsg,
-			header.ICMPv6MinimumSize,
-			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name: "RedirectMsg",
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RedirectMsg
 			},
 		},
@@ -494,16 +543,19 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 				)
 			}
 
-			handleIPv6Payload := func(typ header.ICMPv6Type, size int, checksum bool) {
-				hdr := buffer.NewPrependable(header.IPv6MinimumSize + size)
-				pkt := header.ICMPv6(hdr.Prepend(size))
-				pkt.SetType(typ)
+			handleIPv6Payload := func(checksum bool) {
+				extraDataLen := len(typ.extraData)
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
+				extraData := buffer.View(hdr.Prepend(extraDataLen))
+				copy(extraData, typ.extraData)
+				pkt := header.ICMPv6(hdr.Prepend(typ.size))
+				pkt.SetType(typ.typ)
 				if checksum {
-					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, extraData.ToVectorisedView()))
 				}
 				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
 				ip.Encode(&header.IPv6Fields{
-					PayloadLength: uint16(size),
+					PayloadLength: uint16(typ.size + extraDataLen),
 					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 					HopLimit:      header.NDPHopLimit,
 					SrcAddr:       lladdr1,
@@ -528,7 +580,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 
 			// Without setting checksum, the incoming packet should
 			// be invalid.
-			handleIPv6Payload(typ.typ, typ.size, false)
+			handleIPv6Payload(false)
 			if got := invalid.Value(); got != 1 {
 				t.Fatalf("got invalid = %d, want = 1", got)
 			}
@@ -538,7 +590,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 			}
 
 			// When checksum is set, it should be received.
-			handleIPv6Payload(typ.typ, typ.size, true)
+			handleIPv6Payload(true)
 			if got := typStat.Value(); got != 1 {
 				t.Fatalf("got %s = %d, want = 1", typ.name, got)
 			}
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index bd732f93f..c9395de52 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -70,76 +70,29 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack
 	return s, ep
 }
 
-// TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving an
-// NDP NS message with the Source Link Layer Address option results in a
+// TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving a
+// valid NDP NS message with the Source Link Layer Address option results in a
 // new entry in the link address cache for the sender of the message.
 func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 	const nicID = 1
 
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-	})
-	e := channel.New(0, 1280, linkAddr0)
-	if err := s.CreateNIC(nicID, e); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
-	}
-	if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
-	}
-
-	ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
-	hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
-	pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
-	pkt.SetType(header.ICMPv6NeighborSolicit)
-	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
-	ns.SetTargetAddress(lladdr0)
-	ns.Options().Serialize(header.NDPOptionsSerializer{
-		header.NDPSourceLinkLayerAddressOption(linkAddr1),
-	})
-	pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
-	payloadLength := hdr.UsedLength()
-	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-	ip.Encode(&header.IPv6Fields{
-		PayloadLength: uint16(payloadLength),
-		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-		HopLimit:      255,
-		SrcAddr:       lladdr1,
-		DstAddr:       lladdr0,
-	})
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
-		Data: hdr.View().ToVectorisedView(),
-	})
-
-	linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
-	if err != nil {
-		t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
-	}
-	if c != nil {
-		t.Errorf("got unexpected channel")
-	}
-	if linkAddr != linkAddr1 {
-		t.Errorf("got link address = %s, want = %s", linkAddr, linkAddr1)
-	}
-}
-
-// TestNeighorSolicitationWithInvalidSourceLinkLayerOption tests that receiving
-// an NDP NS message with an invalid Source Link Layer Address option does not
-// result in a new entry in the link address cache for the sender of the
-// message.
-func TestNeighorSolicitationWithInvalidSourceLinkLayerOption(t *testing.T) {
-	const nicID = 1
-
 	tests := []struct {
-		name    string
-		optsBuf []byte
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
 	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{1, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
 		{
 			name:    "Too Small",
-			optsBuf: []byte{1, 1, 1, 2, 3, 4, 5},
+			optsBuf: []byte{1, 1, 2, 3, 4, 5, 6},
 		},
 		{
 			name:    "Invalid Length",
-			optsBuf: []byte{1, 2, 1, 2, 3, 4, 5, 6},
+			optsBuf: []byte{1, 2, 2, 3, 4, 5, 6, 7},
 		},
 	}
 
@@ -186,20 +139,138 @@ func TestNeighorSolicitationWithInvalidSourceLinkLayerOption(t *testing.T) {
 				Data: hdr.View().ToVectorisedView(),
 			})
 
-			// Invalid count should have increased.
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
+			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+			if linkAddr != test.expectedLinkAddr {
+				t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr)
 			}
 
-			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
-			if err != tcpip.ErrWouldBlock {
-				t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+			if test.expectedLinkAddr != "" {
+				if err != nil {
+					t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
+				}
+				if c != nil {
+					t.Errorf("got unexpected channel")
+				}
+
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+				}
+				if c == nil {
+					t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+				}
+
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+			}
+		})
+	}
+}
+
+// TestNeighorAdvertisementWithTargetLinkLayerOption tests that receiving a
+// valid NDP NA message with the Target Link Layer Address option results in a
+// new entry in the link address cache for the target of the message.
+func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{2, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
+		{
+			name:    "Too Small",
+			optsBuf: []byte{2, 1, 2, 3, 4, 5, 6},
+		},
+		{
+			name:    "Invalid Length",
+			optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+			pkt.SetType(header.ICMPv6NeighborAdvert)
+			ns := header.NDPNeighborAdvert(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr1)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
 			}
-			if c == nil {
-				t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+
+			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+			if linkAddr != test.expectedLinkAddr {
+				t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr)
 			}
-			if linkAddr != "" {
-				t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (%s, _, ), want = ('', _, _)", nicID, lladdr1, lladdr0, ProtocolNumber, linkAddr)
+
+			if test.expectedLinkAddr != "" {
+				if err != nil {
+					t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
+				}
+				if c != nil {
+					t.Errorf("got unexpected channel")
+				}
+
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+				}
+				if c == nil {
+					t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+				}
+
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
 			}
 		})
 	}
@@ -238,27 +309,59 @@ func TestHopLimitValidation(t *testing.T) {
 		})
 	}
 
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
 	types := []struct {
 		name        string
 		typ         header.ICMPv6Type
 		size        int
+		extraData   []byte
 		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
 	}{
-		{"RouterSolicit", header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.RouterSolicit
-		}},
-		{"RouterAdvert", header.ICMPv6RouterAdvert, header.ICMPv6HeaderSize + header.NDPRAMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.RouterAdvert
-		}},
-		{"NeighborSolicit", header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.NeighborSolicit
-		}},
-		{"NeighborAdvert", header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.NeighborAdvert
-		}},
-		{"RedirectMsg", header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize, func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-			return stats.RedirectMsg
-		}},
+		{
+			name: "RouterSolicit",
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterSolicit
+			},
+		},
+		{
+			name: "RouterAdvert",
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterAdvert
+			},
+		},
+		{
+			name: "NeighborSolicit",
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborSolicit
+			},
+		},
+		{
+			name:      "NeighborAdvert",
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborAdvert
+			},
+		},
+		{
+			name: "RedirectMsg",
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RedirectMsg
+			},
+		},
 	}
 
 	for _, typ := range types {
@@ -270,10 +373,13 @@ func TestHopLimitValidation(t *testing.T) {
 			invalid := stats.Invalid
 			typStat := typ.statCounter(stats)
 
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size)
+			extraDataLen := len(typ.extraData)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
+			extraData := buffer.View(hdr.Prepend(extraDataLen))
+			copy(extraData, typ.extraData)
 			pkt := header.ICMPv6(hdr.Prepend(typ.size))
 			pkt.SetType(typ.typ)
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
 
 			// Invalid count should initially be 0.
 			if got := invalid.Value(); got != 0 {
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 8af8565f7..9a4607dcb 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -478,13 +478,17 @@ func TestDADFail(t *testing.T) {
 		{
 			"RxAdvert",
 			func(tgt tcpip.Address) buffer.Prependable {
-				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
-				pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize)
+				pkt := header.ICMPv6(hdr.Prepend(naSize))
 				pkt.SetType(header.ICMPv6NeighborAdvert)
 				na := header.NDPNeighborAdvert(pkt.NDPPayload())
 				na.SetSolicitedFlag(true)
 				na.SetOverrideFlag(true)
 				na.SetTargetAddress(tgt)
+				na.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPTargetLinkLayerAddressOption(linkAddr1),
+				})
 				pkt.SetChecksum(header.ICMPv6Checksum(pkt, tgt, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
 				payloadLength := hdr.UsedLength()
 				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-- 
cgit v1.2.3


From 736775e0ac592c266acac0a7415f21d54715a54c Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 6 Feb 2020 14:03:49 -0800
Subject: Make gonet consistent both internally and with the net package.

The types gonet.Conn and gonet.PacketConn were confusingly named as both
implemented net.Conn. Further, gonet.Conn was perhaps unexpectedly
TCP-specific (net.Conn is not). This change renames them to gonet.TCPConn and
gonet.UDPConn.

Renames gonet.NewListener to gonet.ListenTCP and adds a new gonet.NewTCPListner
function to be consistent with both the gonet.DialXxx and gonet.NewXxxConn
functions as well as net.ListenTCP.

Updates #1632

PiperOrigin-RevId: 293671303
---
 benchmarks/tcp/tcp_proxy.go            |   2 +-
 pkg/tcpip/adapters/gonet/gonet.go      | 111 +++++++++++++++++----------------
 pkg/tcpip/adapters/gonet/gonet_test.go |  22 +++----
 3 files changed, 70 insertions(+), 65 deletions(-)

(limited to 'pkg')

diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index 72ada5700..73b7c4f5b 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -274,7 +274,7 @@ func (n netstackImpl) listen(port int) (net.Listener, error) {
 		NIC:  nicID,
 		Port: uint16(port),
 	}
-	listener, err := gonet.NewListener(n.s, addr, ipv4.ProtocolNumber)
+	listener, err := gonet.ListenTCP(n.s, addr, ipv4.ProtocolNumber)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index 711969b9b..6e0db2741 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -43,18 +43,28 @@ func (e *timeoutError) Error() string   { return "i/o timeout" }
 func (e *timeoutError) Timeout() bool   { return true }
 func (e *timeoutError) Temporary() bool { return true }
 
-// A Listener is a wrapper around a tcpip endpoint that implements
+// A TCPListener is a wrapper around a TCP tcpip.Endpoint that implements
 // net.Listener.
-type Listener struct {
+type TCPListener struct {
 	stack  *stack.Stack
 	ep     tcpip.Endpoint
 	wq     *waiter.Queue
 	cancel chan struct{}
 }
 
-// NewListener creates a new Listener.
-func NewListener(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Listener, error) {
-	// Create TCP endpoint, bind it, then start listening.
+// NewTCPListener creates a new TCPListener from a listening tcpip.Endpoint.
+func NewTCPListener(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *TCPListener {
+	return &TCPListener{
+		stack:  s,
+		ep:     ep,
+		wq:     wq,
+		cancel: make(chan struct{}),
+	}
+}
+
+// ListenTCP creates a new TCPListener.
+func ListenTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPListener, error) {
+	// Create a TCP endpoint, bind it, then start listening.
 	var wq waiter.Queue
 	ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq)
 	if err != nil {
@@ -81,28 +91,23 @@ func NewListener(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkPr
 		}
 	}
 
-	return &Listener{
-		stack:  s,
-		ep:     ep,
-		wq:     &wq,
-		cancel: make(chan struct{}),
-	}, nil
+	return NewTCPListener(s, &wq, ep), nil
 }
 
 // Close implements net.Listener.Close.
-func (l *Listener) Close() error {
+func (l *TCPListener) Close() error {
 	l.ep.Close()
 	return nil
 }
 
 // Shutdown stops the HTTP server.
-func (l *Listener) Shutdown() {
+func (l *TCPListener) Shutdown() {
 	l.ep.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
 	close(l.cancel) // broadcast cancellation
 }
 
 // Addr implements net.Listener.Addr.
-func (l *Listener) Addr() net.Addr {
+func (l *TCPListener) Addr() net.Addr {
 	a, err := l.ep.GetLocalAddress()
 	if err != nil {
 		return nil
@@ -208,9 +213,9 @@ func (d *deadlineTimer) SetDeadline(t time.Time) error {
 	return nil
 }
 
-// A Conn is a wrapper around a tcpip.Endpoint that implements the net.Conn
+// A TCPConn is a wrapper around a TCP tcpip.Endpoint that implements the net.Conn
 // interface.
-type Conn struct {
+type TCPConn struct {
 	deadlineTimer
 
 	wq *waiter.Queue
@@ -228,9 +233,9 @@ type Conn struct {
 	read buffer.View
 }
 
-// NewConn creates a new Conn.
-func NewConn(wq *waiter.Queue, ep tcpip.Endpoint) *Conn {
-	c := &Conn{
+// NewTCPConn creates a new TCPConn.
+func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn {
+	c := &TCPConn{
 		wq: wq,
 		ep: ep,
 	}
@@ -239,7 +244,7 @@ func NewConn(wq *waiter.Queue, ep tcpip.Endpoint) *Conn {
 }
 
 // Accept implements net.Conn.Accept.
-func (l *Listener) Accept() (net.Conn, error) {
+func (l *TCPListener) Accept() (net.Conn, error) {
 	n, wq, err := l.ep.Accept()
 
 	if err == tcpip.ErrWouldBlock {
@@ -272,7 +277,7 @@ func (l *Listener) Accept() (net.Conn, error) {
 		}
 	}
 
-	return NewConn(wq, n), nil
+	return NewTCPConn(wq, n), nil
 }
 
 type opErrorer interface {
@@ -323,7 +328,7 @@ func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, a
 }
 
 // Read implements net.Conn.Read.
-func (c *Conn) Read(b []byte) (int, error) {
+func (c *TCPConn) Read(b []byte) (int, error) {
 	c.readMu.Lock()
 	defer c.readMu.Unlock()
 
@@ -352,7 +357,7 @@ func (c *Conn) Read(b []byte) (int, error) {
 }
 
 // Write implements net.Conn.Write.
-func (c *Conn) Write(b []byte) (int, error) {
+func (c *TCPConn) Write(b []byte) (int, error) {
 	deadline := c.writeCancel()
 
 	// Check if deadlineTimer has already expired.
@@ -431,7 +436,7 @@ func (c *Conn) Write(b []byte) (int, error) {
 }
 
 // Close implements net.Conn.Close.
-func (c *Conn) Close() error {
+func (c *TCPConn) Close() error {
 	c.ep.Close()
 	return nil
 }
@@ -440,7 +445,7 @@ func (c *Conn) Close() error {
 // should just use Close.
 //
 // A TCP Half-Close is performed the same as CloseRead for *net.TCPConn.
-func (c *Conn) CloseRead() error {
+func (c *TCPConn) CloseRead() error {
 	if terr := c.ep.Shutdown(tcpip.ShutdownRead); terr != nil {
 		return c.newOpError("close", errors.New(terr.String()))
 	}
@@ -451,7 +456,7 @@ func (c *Conn) CloseRead() error {
 // should just use Close.
 //
 // A TCP Half-Close is performed the same as CloseWrite for *net.TCPConn.
-func (c *Conn) CloseWrite() error {
+func (c *TCPConn) CloseWrite() error {
 	if terr := c.ep.Shutdown(tcpip.ShutdownWrite); terr != nil {
 		return c.newOpError("close", errors.New(terr.String()))
 	}
@@ -459,7 +464,7 @@ func (c *Conn) CloseWrite() error {
 }
 
 // LocalAddr implements net.Conn.LocalAddr.
-func (c *Conn) LocalAddr() net.Addr {
+func (c *TCPConn) LocalAddr() net.Addr {
 	a, err := c.ep.GetLocalAddress()
 	if err != nil {
 		return nil
@@ -468,7 +473,7 @@ func (c *Conn) LocalAddr() net.Addr {
 }
 
 // RemoteAddr implements net.Conn.RemoteAddr.
-func (c *Conn) RemoteAddr() net.Addr {
+func (c *TCPConn) RemoteAddr() net.Addr {
 	a, err := c.ep.GetRemoteAddress()
 	if err != nil {
 		return nil
@@ -476,7 +481,7 @@ func (c *Conn) RemoteAddr() net.Addr {
 	return fullToTCPAddr(a)
 }
 
-func (c *Conn) newOpError(op string, err error) *net.OpError {
+func (c *TCPConn) newOpError(op string, err error) *net.OpError {
 	return &net.OpError{
 		Op:     op,
 		Net:    "tcp",
@@ -494,14 +499,14 @@ func fullToUDPAddr(addr tcpip.FullAddress) *net.UDPAddr {
 	return &net.UDPAddr{IP: net.IP(addr.Addr), Port: int(addr.Port)}
 }
 
-// DialTCP creates a new TCP Conn connected to the specified address.
-func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Conn, error) {
+// DialTCP creates a new TCPConn connected to the specified address.
+func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) {
 	return DialContextTCP(context.Background(), s, addr, network)
 }
 
-// DialContextTCP creates a new TCP Conn connected to the specified address
+// DialContextTCP creates a new TCPConn connected to the specified address
 // with the option of adding cancellation and timeouts.
-func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*Conn, error) {
+func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) {
 	// Create TCP endpoint, then connect.
 	var wq waiter.Queue
 	ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq)
@@ -543,12 +548,12 @@ func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress,
 		}
 	}
 
-	return NewConn(&wq, ep), nil
+	return NewTCPConn(&wq, ep), nil
 }
 
-// A PacketConn is a wrapper around a tcpip endpoint that implements
-// net.PacketConn.
-type PacketConn struct {
+// A UDPConn is a wrapper around a UDP tcpip.Endpoint that implements
+// net.Conn and net.PacketConn.
+type UDPConn struct {
 	deadlineTimer
 
 	stack *stack.Stack
@@ -556,9 +561,9 @@ type PacketConn struct {
 	wq    *waiter.Queue
 }
 
-// NewPacketConn creates a new PacketConn.
-func NewPacketConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *PacketConn {
-	c := &PacketConn{
+// NewUDPConn creates a new UDPConn.
+func NewUDPConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *UDPConn {
+	c := &UDPConn{
 		stack: s,
 		ep:    ep,
 		wq:    wq,
@@ -567,12 +572,12 @@ func NewPacketConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *PacketC
 	return c
 }
 
-// DialUDP creates a new PacketConn.
+// DialUDP creates a new UDPConn.
 //
 // If laddr is nil, a local address is automatically chosen.
 //
-// If raddr is nil, the PacketConn is left unconnected.
-func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*PacketConn, error) {
+// If raddr is nil, the UDPConn is left unconnected.
+func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*UDPConn, error) {
 	var wq waiter.Queue
 	ep, err := s.NewEndpoint(udp.ProtocolNumber, network, &wq)
 	if err != nil {
@@ -591,7 +596,7 @@ func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.Netw
 		}
 	}
 
-	c := NewPacketConn(s, &wq, ep)
+	c := NewUDPConn(s, &wq, ep)
 
 	if raddr != nil {
 		if err := c.ep.Connect(*raddr); err != nil {
@@ -608,11 +613,11 @@ func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.Netw
 	return c, nil
 }
 
-func (c *PacketConn) newOpError(op string, err error) *net.OpError {
+func (c *UDPConn) newOpError(op string, err error) *net.OpError {
 	return c.newRemoteOpError(op, nil, err)
 }
 
-func (c *PacketConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError {
+func (c *UDPConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError {
 	return &net.OpError{
 		Op:     op,
 		Net:    "udp",
@@ -623,7 +628,7 @@ func (c *PacketConn) newRemoteOpError(op string, remote net.Addr, err error) *ne
 }
 
 // RemoteAddr implements net.Conn.RemoteAddr.
-func (c *PacketConn) RemoteAddr() net.Addr {
+func (c *UDPConn) RemoteAddr() net.Addr {
 	a, err := c.ep.GetRemoteAddress()
 	if err != nil {
 		return nil
@@ -632,13 +637,13 @@ func (c *PacketConn) RemoteAddr() net.Addr {
 }
 
 // Read implements net.Conn.Read
-func (c *PacketConn) Read(b []byte) (int, error) {
+func (c *UDPConn) Read(b []byte) (int, error) {
 	bytesRead, _, err := c.ReadFrom(b)
 	return bytesRead, err
 }
 
 // ReadFrom implements net.PacketConn.ReadFrom.
-func (c *PacketConn) ReadFrom(b []byte) (int, net.Addr, error) {
+func (c *UDPConn) ReadFrom(b []byte) (int, net.Addr, error) {
 	deadline := c.readCancel()
 
 	var addr tcpip.FullAddress
@@ -650,12 +655,12 @@ func (c *PacketConn) ReadFrom(b []byte) (int, net.Addr, error) {
 	return copy(b, read), fullToUDPAddr(addr), nil
 }
 
-func (c *PacketConn) Write(b []byte) (int, error) {
+func (c *UDPConn) Write(b []byte) (int, error) {
 	return c.WriteTo(b, nil)
 }
 
 // WriteTo implements net.PacketConn.WriteTo.
-func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) {
+func (c *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
 	deadline := c.writeCancel()
 
 	// Check if deadline has already expired.
@@ -713,13 +718,13 @@ func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) {
 }
 
 // Close implements net.PacketConn.Close.
-func (c *PacketConn) Close() error {
+func (c *UDPConn) Close() error {
 	c.ep.Close()
 	return nil
 }
 
 // LocalAddr implements net.PacketConn.LocalAddr.
-func (c *PacketConn) LocalAddr() net.Addr {
+func (c *UDPConn) LocalAddr() net.Addr {
 	a, err := c.ep.GetLocalAddress()
 	if err != nil {
 		return nil
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index ee077ae83..ea0a0409a 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -41,7 +41,7 @@ const (
 )
 
 func TestTimeouts(t *testing.T) {
-	nc := NewConn(nil, nil)
+	nc := NewTCPConn(nil, nil)
 	dlfs := []struct {
 		name string
 		f    func(time.Time) error
@@ -132,7 +132,7 @@ func TestCloseReader(t *testing.T) {
 
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
 
-	l, e := NewListener(s, addr, ipv4.ProtocolNumber)
+	l, e := ListenTCP(s, addr, ipv4.ProtocolNumber)
 	if e != nil {
 		t.Fatalf("NewListener() = %v", e)
 	}
@@ -168,7 +168,7 @@ func TestCloseReader(t *testing.T) {
 	sender.close()
 }
 
-// TestCloseReaderWithForwarder tests that Conn.Close() wakes Conn.Read() when
+// TestCloseReaderWithForwarder tests that TCPConn.Close wakes TCPConn.Read when
 // using tcp.Forwarder.
 func TestCloseReaderWithForwarder(t *testing.T) {
 	s, err := newLoopbackStack()
@@ -192,7 +192,7 @@ func TestCloseReaderWithForwarder(t *testing.T) {
 		defer ep.Close()
 		r.Complete(false)
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		// Give c.Read() a chance to block before closing the connection.
 		time.AfterFunc(time.Millisecond*50, func() {
@@ -238,7 +238,7 @@ func TestCloseRead(t *testing.T) {
 		defer ep.Close()
 		r.Complete(false)
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		buf := make([]byte, 256)
 		n, e := c.Read(buf)
@@ -257,7 +257,7 @@ func TestCloseRead(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("connect() = %v", terr)
 	}
-	c := NewConn(tc.wq, tc.ep)
+	c := NewTCPConn(tc.wq, tc.ep)
 
 	if err := c.CloseRead(); err != nil {
 		t.Errorf("c.CloseRead() = %v", err)
@@ -291,7 +291,7 @@ func TestCloseWrite(t *testing.T) {
 		defer ep.Close()
 		r.Complete(false)
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		n, e := c.Read(make([]byte, 256))
 		if n != 0 || e != io.EOF {
@@ -309,7 +309,7 @@ func TestCloseWrite(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("connect() = %v", terr)
 	}
-	c := NewConn(tc.wq, tc.ep)
+	c := NewTCPConn(tc.wq, tc.ep)
 
 	if err := c.CloseWrite(); err != nil {
 		t.Errorf("c.CloseWrite() = %v", err)
@@ -353,7 +353,7 @@ func TestUDPForwarder(t *testing.T) {
 		}
 		defer ep.Close()
 
-		c := NewConn(&wq, ep)
+		c := NewTCPConn(&wq, ep)
 
 		buf := make([]byte, 256)
 		n, e := c.Read(buf)
@@ -396,7 +396,7 @@ func TestDeadlineChange(t *testing.T) {
 
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
 
-	l, e := NewListener(s, addr, ipv4.ProtocolNumber)
+	l, e := ListenTCP(s, addr, ipv4.ProtocolNumber)
 	if e != nil {
 		t.Fatalf("NewListener() = %v", e)
 	}
@@ -541,7 +541,7 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) {
 	addr := tcpip.FullAddress{NICID, ip, 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, ip)
 
-	l, err := NewListener(s, addr, ipv4.ProtocolNumber)
+	l, err := ListenTCP(s, addr, ipv4.ProtocolNumber)
 	if err != nil {
 		return nil, nil, nil, fmt.Errorf("NewListener: %v", err)
 	}
-- 
cgit v1.2.3


From 940d255971c38af9f91ceed1345fd973f8fdb41d Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 15:57:34 -0800
Subject: Perform DAD on IPv6 addresses when enabling a NIC

Addresses may be added before a NIC is enabled. Make sure DAD is
performed on the permanent IPv6 addresses when they get enabled.

Test:
- stack_test.TestDoDADWhenNICEnabled
- stack.TestDisabledRxStatsWhenNICDisabled
PiperOrigin-RevId: 293697429
---
 pkg/tcpip/stack/BUILD         |   6 ++-
 pkg/tcpip/stack/ndp_test.go   |  74 +++++++++++++--------------
 pkg/tcpip/stack/nic.go        |  84 ++++++++++++++++++++++--------
 pkg/tcpip/stack/nic_test.go   |  62 +++++++++++++++++++++++
 pkg/tcpip/stack/stack_test.go | 115 ++++++++++++++++++++++++++++++++++++++++++
 pkg/tcpip/tcpip.go            |   8 +--
 6 files changed, 287 insertions(+), 62 deletions(-)
 create mode 100644 pkg/tcpip/stack/nic_test.go

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index f5b750046..705cf01ee 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -78,11 +78,15 @@ go_test(
 go_test(
     name = "stack_test",
     size = "small",
-    srcs = ["linkaddrcache_test.go"],
+    srcs = [
+        "linkaddrcache_test.go",
+        "nic_test.go",
+    ],
     library = ":stack",
     deps = [
         "//pkg/sleep",
         "//pkg/sync",
         "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
     ],
 )
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 9a4607dcb..1e575bdaf 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -1539,7 +1539,7 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 }
 
 // Checks to see if list contains an IPv6 address, item.
-func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
+func containsV6Addr(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
 	protocolAddress := tcpip.ProtocolAddress{
 		Protocol:          header.IPv6ProtocolNumber,
 		AddressWithPrefix: item,
@@ -1665,7 +1665,7 @@ func TestAutoGenAddr(t *testing.T) {
 	// with non-zero lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 
@@ -1681,10 +1681,10 @@ func TestAutoGenAddr(t *testing.T) {
 	// Receive an RA with prefix2 in a PI.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) {
 		t.Fatalf("Should have %s in the list of addresses", addr2)
 	}
 
@@ -1705,10 +1705,10 @@ func TestAutoGenAddr(t *testing.T) {
 	case <-time.After(newMinVLDuration + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
-	if contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
 		t.Fatalf("Should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) {
 		t.Fatalf("Should have %s in the list of addresses", addr2)
 	}
 }
@@ -1853,7 +1853,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Receive PI for prefix1.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
 	expectPrimaryAddr(addr1)
@@ -1861,7 +1861,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Deprecate addr for prefix1 immedaitely.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr1, deprecatedAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
 	// addr should still be the primary endpoint as there are no other addresses.
@@ -1879,7 +1879,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Receive PI for prefix2.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr2)
@@ -1887,7 +1887,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// Deprecate addr for prefix2 immedaitely.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr2, deprecatedAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	// addr1 should be the primary endpoint now since addr2 is deprecated but
@@ -1982,7 +1982,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// Receive PI for prefix2.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr2)
@@ -1990,10 +1990,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// Receive a PI for prefix1.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr1)
@@ -2009,10 +2009,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 
 	// Wait for addr of prefix1 to be deprecated.
 	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	// addr2 should be the primary endpoint now since addr1 is deprecated but
@@ -2049,10 +2049,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 
 	// Wait for addr of prefix1 to be deprecated.
 	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	// addr2 should be the primary endpoint now since it is not deprecated.
@@ -2063,10 +2063,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 
 	// Wait for addr of prefix1 to be invalidated.
 	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncEventTimeout)
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 	expectPrimaryAddr(addr2)
@@ -2112,10 +2112,10 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	case <-time.After(newMinVLDuration + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should not have %s in the list of addresses", addr2)
 	}
 	// Should not have any primary endpoints.
@@ -2600,7 +2600,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 	if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil {
 		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err)
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 
@@ -2613,7 +2613,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 		t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically")
 	default:
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 
@@ -2624,7 +2624,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 		t.Fatal("unexpectedly received an auto gen addr event")
 	case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
 	}
-	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
 		t.Fatalf("Should have %s in the list of addresses", addr1)
 	}
 }
@@ -2702,17 +2702,17 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	const validLifetimeSecondPrefix1 = 1
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, validLifetimeSecondPrefix1, 0))
 	expectAutoGenAddrEvent(addr1, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
 
 	// Receive an RA with prefix2 in a PI with a large valid lifetime.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
 	expectAutoGenAddrEvent(addr2, newAddr)
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 
@@ -2725,10 +2725,10 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
-	if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
 		t.Fatalf("should not have %s in the list of addresses", addr1)
 	}
-	if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
 		t.Fatalf("should have %s in the list of addresses", addr2)
 	}
 }
@@ -3014,16 +3014,16 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo := s.NICInfo()
 	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
-	if !contains(nic1Addrs, e1Addr1) {
+	if !containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
-	if !contains(nic1Addrs, e1Addr2) {
+	if !containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
-	if !contains(nic2Addrs, e2Addr1) {
+	if !containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
-	if !contains(nic2Addrs, e2Addr2) {
+	if !containsV6Addr(nic2Addrs, e2Addr2) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
 	}
 
@@ -3102,16 +3102,16 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo = s.NICInfo()
 	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
-	if contains(nic1Addrs, e1Addr1) {
+	if containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
-	if contains(nic1Addrs, e1Addr2) {
+	if containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
-	if contains(nic2Addrs, e2Addr1) {
+	if containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
-	if contains(nic2Addrs, e2Addr2) {
+	if containsV6Addr(nic2Addrs, e2Addr2) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
 	}
 
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 7dad9a8cb..682e9c416 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -16,6 +16,7 @@ package stack
 
 import (
 	"log"
+	"reflect"
 	"sort"
 	"strings"
 	"sync/atomic"
@@ -39,6 +40,7 @@ type NIC struct {
 
 	mu struct {
 		sync.RWMutex
+		enabled       bool
 		spoofing      bool
 		promiscuous   bool
 		primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
@@ -56,6 +58,14 @@ type NIC struct {
 type NICStats struct {
 	Tx DirectionStats
 	Rx DirectionStats
+
+	DisabledRx DirectionStats
+}
+
+func makeNICStats() NICStats {
+	var s NICStats
+	tcpip.InitStatCounters(reflect.ValueOf(&s).Elem())
+	return s
 }
 
 // DirectionStats includes packet and byte counts.
@@ -99,16 +109,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 		name:    name,
 		linkEP:  ep,
 		context: ctx,
-		stats: NICStats{
-			Tx: DirectionStats{
-				Packets: &tcpip.StatCounter{},
-				Bytes:   &tcpip.StatCounter{},
-			},
-			Rx: DirectionStats{
-				Packets: &tcpip.StatCounter{},
-				Bytes:   &tcpip.StatCounter{},
-			},
-		},
+		stats:   makeNICStats(),
 	}
 	nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint)
 	nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
@@ -137,14 +138,30 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 // enable enables the NIC. enable will attach the link to its LinkEndpoint and
 // join the IPv6 All-Nodes Multicast address (ff02::1).
 func (n *NIC) enable() *tcpip.Error {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	if enabled {
+		return nil
+	}
+
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	if n.mu.enabled {
+		return nil
+	}
+
+	n.mu.enabled = true
+
 	n.attachLinkEndpoint()
 
 	// Create an endpoint to receive broadcast packets on this interface.
 	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
-		if err := n.AddAddress(tcpip.ProtocolAddress{
+		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
 			Protocol:          header.IPv4ProtocolNumber,
 			AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Broadcast, 8 * header.IPv4AddressSize},
-		}, NeverPrimaryEndpoint); err != nil {
+		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -166,8 +183,22 @@ func (n *NIC) enable() *tcpip.Error {
 		return nil
 	}
 
-	n.mu.Lock()
-	defer n.mu.Unlock()
+	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
+	// state.
+	//
+	// Addresses may have aleady completed DAD but in the time since the NIC was
+	// last enabled, other devices may have acquired the same addresses.
+	for _, r := range n.mu.endpoints {
+		addr := r.ep.ID().LocalAddress
+		if k := r.getKind(); (k != permanent && k != permanentTentative) || !header.IsV6UnicastAddress(addr) {
+			continue
+		}
+
+		r.setKind(permanentTentative)
+		if err := n.mu.ndp.startDuplicateAddressDetection(addr, r); err != nil {
+			return err
+		}
+	}
 
 	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
 		return err
@@ -633,7 +664,9 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
 
 	// If the address is an IPv6 address and it is a permanent address,
-	// mark it as tentative so it goes through the DAD process.
+	// mark it as tentative so it goes through the DAD process if the NIC is
+	// enabled. If the NIC is not enabled, DAD will be started when the NIC is
+	// enabled.
 	if isIPv6Unicast && kind == permanent {
 		kind = permanentTentative
 	}
@@ -668,8 +701,8 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 
 	n.insertPrimaryEndpointLocked(ref, peb)
 
-	// If we are adding a tentative IPv6 address, start DAD.
-	if isIPv6Unicast && kind == permanentTentative {
+	// If we are adding a tentative IPv6 address, start DAD if the NIC is enabled.
+	if isIPv6Unicast && kind == permanentTentative && n.mu.enabled {
 		if err := n.mu.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
 			return nil, err
 		}
@@ -700,9 +733,7 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 		// Don't include tentative, expired or temporary endpoints to
 		// avoid confusion and prevent the caller from using those.
 		switch ref.getKind() {
-		case permanentTentative, permanentExpired, temporary:
-			// TODO(b/140898488): Should tentative addresses be
-			//                    returned?
+		case permanentExpired, temporary:
 			continue
 		}
 		addrs = append(addrs, tcpip.ProtocolAddress{
@@ -1016,11 +1047,23 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
 func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	// If the NIC is not yet enabled, don't receive any packets.
+	if !enabled {
+		n.mu.RUnlock()
+
+		n.stats.DisabledRx.Packets.Increment()
+		n.stats.DisabledRx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
+		return
+	}
+
 	n.stats.Rx.Packets.Increment()
 	n.stats.Rx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
 
 	netProto, ok := n.stack.networkProtocols[protocol]
 	if !ok {
+		n.mu.RUnlock()
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
 		return
 	}
@@ -1032,7 +1075,6 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 
 	// Are any packet sockets listening for this network protocol?
-	n.mu.RLock()
 	packetEPs := n.mu.packetEPs[protocol]
 	// Check whether there are packet sockets listening for every protocol.
 	// If we received a packet with protocol EthernetProtocolAll, then the
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
new file mode 100644
index 000000000..edaee3b86
--- /dev/null
+++ b/pkg/tcpip/stack/nic_test.go
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
+	// When the NIC is disabled, the only field that matters is the stats field.
+	// This test is limited to stats counter checks.
+	nic := NIC{
+		stats: makeNICStats(),
+	}
+
+	if got := nic.stats.DisabledRx.Packets.Value(); got != 0 {
+		t.Errorf("got DisabledRx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.DisabledRx.Bytes.Value(); got != 0 {
+		t.Errorf("got DisabledRx.Bytes = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Packets.Value(); got != 0 {
+		t.Errorf("got Rx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Bytes.Value(); got != 0 {
+		t.Errorf("got Rx.Bytes = %d, want = 0", got)
+	}
+
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	nic.DeliverNetworkPacket(nil, "", "", 0, tcpip.PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+
+	if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
+		t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
+	}
+	if got := nic.stats.DisabledRx.Bytes.Value(); got != 4 {
+		t.Errorf("got DisabledRx.Bytes = %d, want = 4", got)
+	}
+	if got := nic.stats.Rx.Packets.Value(); got != 0 {
+		t.Errorf("got Rx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Bytes.Value(); got != 0 {
+		t.Errorf("got Rx.Bytes = %d, want = 0", got)
+	}
+}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 834fe9487..243868f3a 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2561,3 +2561,118 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 		})
 	}
 }
+
+// TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC
+// was disabled have DAD performed on them when the NIC is enabled.
+func TestDoDADWhenNICEnabled(t *testing.T) {
+	t.Parallel()
+
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const nicID = 1
+
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits: dadTransmits,
+			RetransmitTimer:        retransmitTimer,
+		},
+		NDPDisp: &ndpDisp,
+	}
+
+	e := channel.New(dadTransmits, 1280, linkAddr1)
+	s := stack.New(opts)
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	addr := tcpip.ProtocolAddress{
+		Protocol: header.IPv6ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   llAddr1,
+			PrefixLen: 128,
+		},
+	}
+	if err := s.AddProtocolAddress(nicID, addr); err != nil {
+		t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err)
+	}
+
+	// Address should be in the list of all addresses.
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+
+	// Address should be tentative so it should not be a main address.
+	got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+
+	// Enabling the NIC should start DAD for the address.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+
+	// Address should not be considered bound to the NIC yet (DAD ongoing).
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+
+	// Wait for DAD to resolve.
+	select {
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for DAD resolution")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicID != nicID {
+			t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID)
+		}
+		if e.addr != addr.AddressWithPrefix.Address {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr.AddressWithPrefix.Address)
+		}
+		if !e.resolved {
+			t.Fatal("got DAD event w/ resolved = false, want = true")
+		}
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if got != addr.AddressWithPrefix {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix)
+	}
+
+	// Enabling the NIC again should be a no-op.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if got != addr.AddressWithPrefix {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix)
+	}
+}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index d29d9a704..0e944712f 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1170,7 +1170,9 @@ type TransportEndpointStats struct {
 // marker interface.
 func (*TransportEndpointStats) IsEndpointStats() {}
 
-func fillIn(v reflect.Value) {
+// InitStatCounters initializes v's fields with nil StatCounter fields to new
+// StatCounters.
+func InitStatCounters(v reflect.Value) {
 	for i := 0; i < v.NumField(); i++ {
 		v := v.Field(i)
 		if s, ok := v.Addr().Interface().(**StatCounter); ok {
@@ -1178,14 +1180,14 @@ func fillIn(v reflect.Value) {
 				*s = new(StatCounter)
 			}
 		} else {
-			fillIn(v)
+			InitStatCounters(v)
 		}
 	}
 }
 
 // FillIn returns a copy of s with nil fields initialized to new StatCounters.
 func (s Stats) FillIn() Stats {
-	fillIn(reflect.ValueOf(&s).Elem())
+	InitStatCounters(reflect.ValueOf(&s).Elem())
 	return s
 }
 
-- 
cgit v1.2.3


From 3700221b1f3ff0779a0f4479fd2bafa3312d5a23 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 16:42:37 -0800
Subject: Auto-generate link-local address as a SLAAC address

Auto-generated link-local addresses should have the same lifecycle hooks
as global SLAAC addresses.

The Stack's NDP dispatcher should be notified when link-local addresses
are auto-generated and invalidated. They should also be removed when a
NIC is disabled (which will be supported in a later change).

Tests:
- stack_test.TestNICAutoGenAddrWithOpaque
- stack_test.TestNICAutoGenAddr
PiperOrigin-RevId: 293706760
---
 pkg/tcpip/stack/ndp.go        |  36 +++--
 pkg/tcpip/stack/ndp_test.go   |  85 +++++++-----
 pkg/tcpip/stack/nic.go        |  30 +----
 pkg/tcpip/stack/stack_test.go | 307 +++++++++++++++++++-----------------------
 4 files changed, 218 insertions(+), 240 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 6123fda33..fae5f5014 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -906,22 +906,21 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 		return
 	}
 
-	// We do not already have an address within the prefix, prefix. Do the
+	// We do not already have an address with the prefix prefix. Do the
 	// work as outlined by RFC 4862 section 5.5.3.d if n is configured
-	// to auto-generated global addresses by SLAAC.
-	ndp.newAutoGenAddress(prefix, pl, vl)
+	// to auto-generate global addresses by SLAAC.
+	if !ndp.configs.AutoGenGlobalAddresses {
+		return
+	}
+
+	ndp.doSLAAC(prefix, pl, vl)
 }
 
-// newAutoGenAddress generates a new SLAAC address with the provided lifetimes
+// doSLAAC generates a new SLAAC address with the provided lifetimes
 // for prefix.
 //
 // pl is the new preferred lifetime. vl is the new valid lifetime.
-func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration) {
-	// Are we configured to auto-generate new global addresses?
-	if !ndp.configs.AutoGenGlobalAddresses {
-		return
-	}
-
+func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	// If we do not already have an address for this prefix and the valid
 	// lifetime is 0, no need to do anything further, as per RFC 4862
 	// section 5.5.3.d.
@@ -1152,12 +1151,21 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 //
 // The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupHostOnlyState() {
+	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
+	linkLocalAddrs := 0
 	for addr := range ndp.autoGenAddresses {
+		// RFC 4862 section 5 states that routers are also expected to generate a
+		// link-local address so we do not invalidate them.
+		if linkLocalSubnet.Contains(addr) {
+			linkLocalAddrs++
+			continue
+		}
+
 		ndp.invalidateAutoGenAddress(addr)
 	}
 
-	if got := len(ndp.autoGenAddresses); got != 0 {
-		log.Fatalf("ndp: still have auto-generated addresses after cleaning up, found = %d", got)
+	if got := len(ndp.autoGenAddresses); got != linkLocalAddrs {
+		log.Fatalf("ndp: still have non-linklocal auto-generated addresses after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalAddrs)
 	}
 
 	for prefix := range ndp.onLinkPrefixes {
@@ -1165,7 +1173,7 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 	}
 
 	if got := len(ndp.onLinkPrefixes); got != 0 {
-		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up, found = %d", got)
+		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got)
 	}
 
 	for router := range ndp.defaultRouters {
@@ -1173,7 +1181,7 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 	}
 
 	if got := len(ndp.defaultRouters); got != 0 {
-		log.Fatalf("ndp: still have discovered default routers after cleaning up, found = %d", got)
+		log.Fatalf("ndp: still have discovered default routers after cleaning up; found = %d", got)
 	}
 }
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 1e575bdaf..e13509fbd 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -42,6 +42,7 @@ const (
 	linkAddr1                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
 	linkAddr2                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
 	linkAddr3                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
+	linkAddr4                = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09")
 	defaultTimeout           = 100 * time.Millisecond
 	defaultAsyncEventTimeout = time.Second
 )
@@ -50,6 +51,7 @@ var (
 	llAddr1 = header.LinkLocalAddr(linkAddr1)
 	llAddr2 = header.LinkLocalAddr(linkAddr2)
 	llAddr3 = header.LinkLocalAddr(linkAddr3)
+	llAddr4 = header.LinkLocalAddr(linkAddr4)
 	dstAddr = tcpip.FullAddress{
 		Addr: "\x0a\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
 		Port: 25,
@@ -2882,8 +2884,8 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 }
 
 // TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers
-// and prefixes, and auto-generated addresses get invalidated when a NIC
-// becomes a router.
+// and prefixes, and non-linklocal auto-generated addresses are invalidated when
+// a NIC becomes a router.
 func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	t.Parallel()
 
@@ -2898,6 +2900,14 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	prefix2, subnet2, e1Addr2 := prefixSubnetAddr(1, linkAddr1)
 	e2Addr1 := addrForSubnet(subnet1, linkAddr2)
 	e2Addr2 := addrForSubnet(subnet2, linkAddr2)
+	llAddrWithPrefix1 := tcpip.AddressWithPrefix{
+		Address:   llAddr1,
+		PrefixLen: 64,
+	}
+	llAddrWithPrefix2 := tcpip.AddressWithPrefix{
+		Address:   llAddr2,
+		PrefixLen: 64,
+	}
 
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, maxEvents),
@@ -2907,7 +2917,8 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		autoGenAddrC:   make(chan ndpAutoGenAddrEvent, maxEvents),
 	}
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+		AutoGenIPv6LinkLocal: true,
 		NDPConfigs: stack.NDPConfigurations{
 			HandleRAs:              true,
 			DiscoverDefaultRouters: true,
@@ -2917,16 +2928,6 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		NDPDisp: &ndpDisp,
 	})
 
-	e1 := channel.New(0, 1280, linkAddr1)
-	if err := s.CreateNIC(nicID1, e1); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
-	}
-
-	e2 := channel.New(0, 1280, linkAddr2)
-	if err := s.CreateNIC(nicID2, e2); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
-	}
-
 	expectRouterEvent := func() (bool, ndpRouterEvent) {
 		select {
 		case e := <-ndpDisp.routerC:
@@ -2957,18 +2958,30 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		return false, ndpAutoGenAddrEvent{}
 	}
 
-	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr1 and
-	// llAddr2) w/ PI (for prefix1 in RA from llAddr1 and prefix2 in RA from
-	// llAddr2) to discover multiple routers and prefixes, and auto-gen
-	// multiple addresses.
-
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+	e1 := channel.New(0, 1280, linkAddr1)
+	if err := s.CreateNIC(nicID1, e1); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+	}
 	// We have other tests that make sure we receive the *correct* events
 	// on normal discovery of routers/prefixes, and auto-generated
 	// addresses. Here we just make sure we get an event and let other tests
 	// handle the correctness check.
+	expectAutoGenAddrEvent()
+
+	e2 := channel.New(0, 1280, linkAddr2)
+	if err := s.CreateNIC(nicID2, e2); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+	}
+	expectAutoGenAddrEvent()
+
+	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
+	// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
+	// llAddr4) to discover multiple routers and prefixes, and auto-gen
+	// multiple addresses.
+
+	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID1)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
@@ -2977,9 +2990,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
 	}
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID1)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
@@ -2988,9 +3001,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
 	}
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID2)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
@@ -2999,9 +3012,9 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
 	}
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
 	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID2)
+		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
 	}
 	if ok, _ := expectPrefixEvent(); !ok {
 		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
@@ -3014,12 +3027,18 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo := s.NICInfo()
 	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+	}
 	if !containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
 	if !containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
+	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+	}
 	if !containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
@@ -3071,10 +3090,10 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	}
 
 	expectedRouterEvents := map[ndpRouterEvent]int{
-		{nicID: nicID1, addr: llAddr1, discovered: false}: 1,
-		{nicID: nicID1, addr: llAddr2, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr1, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr2, discovered: false}: 1,
+		{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
+		{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
+		{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
+		{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
 	}
 	if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
 		t.Errorf("router events mismatch (-want +got):\n%s", diff)
@@ -3102,12 +3121,18 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 	nicinfo = s.NICInfo()
 	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
 	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+	}
 	if containsV6Addr(nic1Addrs, e1Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
 	}
 	if containsV6Addr(nic1Addrs, e1Addr2) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
 	}
+	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+	}
 	if containsV6Addr(nic2Addrs, e2Addr1) {
 		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
 	}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 682e9c416..78d451cca 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -206,33 +206,9 @@ func (n *NIC) enable() *tcpip.Error {
 
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
 	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
-		var addr tcpip.Address
-		if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
-			addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey)
-		} else {
-			l2addr := n.linkEP.LinkAddress()
-
-			// Only attempt to generate the link-local address if we have a valid MAC
-			// address.
-			//
-			// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
-			// LinkEndpoint.LinkAddress) before reaching this point.
-			if !header.IsValidUnicastEthernetAddress(l2addr) {
-				return nil
-			}
-
-			addr = header.LinkLocalAddr(l2addr)
-		}
-
-		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
-			Protocol: header.IPv6ProtocolNumber,
-			AddressWithPrefix: tcpip.AddressWithPrefix{
-				Address:   addr,
-				PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
-			},
-		}, CanBePrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
-			return err
-		}
+		// The valid and preferred lifetime is infinite for the auto-generated
+		// link-local address.
+		n.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
 	}
 
 	// If we are operating as a router, then do not solicit routers since we
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 243868f3a..b2c1763bf 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1894,112 +1894,6 @@ func TestNICForwarding(t *testing.T) {
 	}
 }
 
-// TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses
-// using the modified EUI-64 of the NIC's MAC address (or lack there-of if
-// disabled (default)). Note, DAD will be disabled in these tests.
-func TestNICAutoGenAddr(t *testing.T) {
-	tests := []struct {
-		name      string
-		autoGen   bool
-		linkAddr  tcpip.LinkAddress
-		iidOpts   stack.OpaqueInterfaceIdentifierOptions
-		shouldGen bool
-	}{
-		{
-			"Disabled",
-			false,
-			linkAddr1,
-			stack.OpaqueInterfaceIdentifierOptions{
-				NICNameFromID: func(nicID tcpip.NICID, _ string) string {
-					return fmt.Sprintf("nic%d", nicID)
-				},
-			},
-			false,
-		},
-		{
-			"Enabled",
-			true,
-			linkAddr1,
-			stack.OpaqueInterfaceIdentifierOptions{},
-			true,
-		},
-		{
-			"Nil MAC",
-			true,
-			tcpip.LinkAddress([]byte(nil)),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Empty MAC",
-			true,
-			tcpip.LinkAddress(""),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Invalid MAC",
-			true,
-			tcpip.LinkAddress("\x01\x02\x03"),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Multicast MAC",
-			true,
-			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-		{
-			"Unspecified MAC",
-			true,
-			tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
-			stack.OpaqueInterfaceIdentifierOptions{},
-			false,
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			opts := stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				OpaqueIIDOpts:    test.iidOpts,
-			}
-
-			if test.autoGen {
-				// Only set opts.AutoGenIPv6LinkLocal when test.autoGen is true because
-				// opts.AutoGenIPv6LinkLocal should be false by default.
-				opts.AutoGenIPv6LinkLocal = true
-			}
-
-			e := channel.New(10, 1280, test.linkAddr)
-			s := stack.New(opts)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
-			}
-
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
-			}
-
-			if test.shouldGen {
-				// Should have auto-generated an address and resolved immediately (DAD
-				// is disabled).
-				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
-				}
-			} else {
-				// Should not have auto-generated an address.
-				if want := (tcpip.AddressWithPrefix{}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
-				}
-			}
-		})
-	}
-}
-
 // TestNICContextPreservation tests that you can read out via stack.NICInfo the
 // Context data you pass via NICContext.Context in stack.CreateNICWithOptions.
 func TestNICContextPreservation(t *testing.T) {
@@ -2040,11 +1934,9 @@ func TestNICContextPreservation(t *testing.T) {
 	}
 }
 
-// TestNICAutoGenAddrWithOpaque tests the auto-generation of IPv6 link-local
-// addresses with opaque interface identifiers. Link Local addresses should
-// always be generated with opaque IIDs if configured to use them, even if the
-// NIC has an invalid MAC address.
-func TestNICAutoGenAddrWithOpaque(t *testing.T) {
+// TestNICAutoGenLinkLocalAddr tests the auto-generation of IPv6 link-local
+// addresses.
+func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 	const nicID = 1
 
 	var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte
@@ -2056,108 +1948,185 @@ func TestNICAutoGenAddrWithOpaque(t *testing.T) {
 		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n)
 	}
 
+	nicNameFunc := func(_ tcpip.NICID, name string) string {
+		return name
+	}
+
 	tests := []struct {
-		name      string
-		nicName   string
-		autoGen   bool
-		linkAddr  tcpip.LinkAddress
-		secretKey []byte
+		name         string
+		nicName      string
+		autoGen      bool
+		linkAddr     tcpip.LinkAddress
+		iidOpts      stack.OpaqueInterfaceIdentifierOptions
+		shouldGen    bool
+		expectedAddr tcpip.Address
 	}{
 		{
 			name:      "Disabled",
 			nicName:   "nic1",
 			autoGen:   false,
 			linkAddr:  linkAddr1,
-			secretKey: secretKey[:],
+			shouldGen: false,
 		},
 		{
-			name:      "Enabled",
-			nicName:   "nic1",
-			autoGen:   true,
-			linkAddr:  linkAddr1,
-			secretKey: secretKey[:],
+			name:     "Disabled without OIID options",
+			nicName:  "nic1",
+			autoGen:  false,
+			linkAddr: linkAddr1,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:],
+			},
+			shouldGen: false,
 		},
-		// These are all cases where we would not have generated a
-		// link-local address if opaque IIDs were disabled.
+
+		// Tests for EUI64 based addresses.
 		{
-			name:      "Nil MAC and empty nicName",
-			nicName:   "",
+			name:         "EUI64 Enabled",
+			autoGen:      true,
+			linkAddr:     linkAddr1,
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddr(linkAddr1),
+		},
+		{
+			name:      "EUI64 Empty MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress([]byte(nil)),
-			secretKey: secretKey[:1],
+			shouldGen: false,
 		},
 		{
-			name:      "Empty MAC and empty nicName",
+			name:      "EUI64 Invalid MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress(""),
-			secretKey: secretKey[:2],
+			linkAddr:  "\x01\x02\x03",
+			shouldGen: false,
 		},
 		{
-			name:      "Invalid MAC",
-			nicName:   "test",
+			name:      "EUI64 Multicast MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress("\x01\x02\x03"),
-			secretKey: secretKey[:3],
+			linkAddr:  "\x01\x02\x03\x04\x05\x06",
+			shouldGen: false,
 		},
 		{
-			name:      "Multicast MAC",
-			nicName:   "test2",
+			name:      "EUI64 Unspecified MAC",
 			autoGen:   true,
-			linkAddr:  tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
-			secretKey: secretKey[:4],
+			linkAddr:  "\x00\x00\x00\x00\x00\x00",
+			shouldGen: false,
 		},
+
+		// Tests for Opaque IID based addresses.
 		{
-			name:     "Unspecified MAC and nil SecretKey",
+			name:     "OIID Enabled",
+			nicName:  "nic1",
+			autoGen:  true,
+			linkAddr: linkAddr1,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("nic1", 0, secretKey[:]),
+		},
+		// These are all cases where we would not have generated a
+		// link-local address if opaque IIDs were disabled.
+		{
+			name:    "OIID Empty MAC and empty nicName",
+			autoGen: true,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:1],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("", 0, secretKey[:1]),
+		},
+		{
+			name:     "OIID Invalid MAC",
+			nicName:  "test",
+			autoGen:  true,
+			linkAddr: "\x01\x02\x03",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:2],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test", 0, secretKey[:2]),
+		},
+		{
+			name:     "OIID Multicast MAC",
+			nicName:  "test2",
+			autoGen:  true,
+			linkAddr: "\x01\x02\x03\x04\x05\x06",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:3],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test2", 0, secretKey[:3]),
+		},
+		{
+			name:     "OIID Unspecified MAC and nil SecretKey",
 			nicName:  "test3",
 			autoGen:  true,
-			linkAddr: tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+			linkAddr: "\x00\x00\x00\x00\x00\x00",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test3", 0, nil),
 		},
 	}
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			opts := stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
-					NICNameFromID: func(_ tcpip.NICID, nicName string) string {
-						return nicName
-					},
-					SecretKey: test.secretKey,
-				},
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
-
-			if test.autoGen {
-				// Only set opts.AutoGenIPv6LinkLocal when
-				// test.autoGen is true because
-				// opts.AutoGenIPv6LinkLocal should be false by
-				// default.
-				opts.AutoGenIPv6LinkLocal = true
+			opts := stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: test.autoGen,
+				NDPDisp:              &ndpDisp,
+				OpaqueIIDOpts:        test.iidOpts,
 			}
 
-			e := channel.New(10, 1280, test.linkAddr)
+			e := channel.New(0, 1280, test.linkAddr)
 			s := stack.New(opts)
 			nicOpts := stack.NICOptions{Name: test.nicName}
 			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
 				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
 			}
 
-			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
-			}
+			var expectedMainAddr tcpip.AddressWithPrefix
+
+			if test.shouldGen {
+				expectedMainAddr = tcpip.AddressWithPrefix{
+					Address:   test.expectedAddr,
+					PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
+				}
 
-			if test.autoGen {
-				// Should have auto-generated an address and
-				// resolved immediately (DAD is disabled).
-				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID(test.nicName, 0, test.secretKey), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+				// Should have auto-generated an address and resolved immediately (DAD
+				// is disabled).
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, expectedMainAddr, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
 				}
 			} else {
 				// Should not have auto-generated an address.
-				if want := (tcpip.AddressWithPrefix{}); addr != want {
-					t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				select {
+				case <-ndpDisp.autoGenAddrC:
+					t.Fatal("unexpectedly auto-generated an address")
+				default:
 				}
 			}
+
+			gotMainAddr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+			}
+			if gotMainAddr != expectedMainAddr {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", gotMainAddr, expectedMainAddr)
+			}
 		})
 	}
 }
@@ -2226,7 +2195,7 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 		NDPDisp:              &ndpDisp,
 	}
 
-	e := channel.New(10, 1280, linkAddr1)
+	e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1)
 	s := stack.New(opts)
 	if err := s.CreateNIC(1, e); err != nil {
 		t.Fatalf("CreateNIC(_) = %s", err)
-- 
cgit v1.2.3


From ca30dfa065f5458228b06dcf5379ed4edf29c165 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 6 Feb 2020 19:49:30 -0800
Subject: Send DAD event when DAD resolves immediately

Previously, a DAD event would not be sent if DAD was disabled.

This allows integrators to do some work when an IPv6 address is bound to
a NIC without special logic that checks if DAD is enabled.

Without this change, integrators would need to check if a NIC has DAD
enabled when an address is auto-generated. If DAD is enabled, it would
need to delay the work until the DAD completion event; otherwise, it
would need to do the work in the address auto-generated event handler.

Test: stack_test.TestDADDisabled
PiperOrigin-RevId: 293732914
---
 pkg/tcpip/stack/ndp.go        |   7 ++
 pkg/tcpip/stack/ndp_test.go   | 263 +++++++++++++++++++++---------------------
 pkg/tcpip/stack/stack_test.go |  44 +++----
 3 files changed, 154 insertions(+), 160 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index fae5f5014..045409bda 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -448,6 +448,13 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 	remaining := ndp.configs.DupAddrDetectTransmits
 	if remaining == 0 {
 		ref.setKind(permanent)
+
+		// Consider DAD to have resolved even if no DAD messages were actually
+		// transmitted.
+		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, true, nil)
+		}
+
 		return nil
 	}
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index e13509fbd..1f6f77439 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -86,39 +86,6 @@ func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWi
 	return prefix, subnet, addrForSubnet(subnet, linkAddr)
 }
 
-// TestDADDisabled tests that an address successfully resolves immediately
-// when DAD is not enabled (the default for an empty stack.Options).
-func TestDADDisabled(t *testing.T) {
-	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-	}
-
-	e := channel.New(0, 1280, linkAddr1)
-	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
-	}
-
-	if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
-	}
-
-	// Should get the address immediately since we should not have performed
-	// DAD on it.
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
-	}
-	if addr.Address != addr1 {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1)
-	}
-
-	// We should not have sent any NDP NS messages.
-	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 {
-		t.Fatalf("got NeighborSolicit = %d, want = 0", got)
-	}
-}
-
 // ndpDADEvent is a set of parameters that was passed to
 // ndpDispatcher.OnDuplicateAddressDetectionStatus.
 type ndpDADEvent struct {
@@ -300,6 +267,58 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s
 	}
 }
 
+// Check e to make sure that the event is for addr on nic with ID 1, and the
+// resolved flag set to resolved with the specified err.
+func checkDADEvent(e ndpDADEvent, nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) string {
+	return cmp.Diff(ndpDADEvent{nicID: nicID, addr: addr, resolved: resolved, err: err}, e, cmp.AllowUnexported(e))
+}
+
+// TestDADDisabled tests that an address successfully resolves immediately
+// when DAD is not enabled (the default for an empty stack.Options).
+func TestDADDisabled(t *testing.T) {
+	const nicID = 1
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent, 1),
+	}
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPDisp:          &ndpDisp,
+	}
+
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+	}
+
+	// Should get the address immediately since we should not have performed
+	// DAD on it.
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected DAD event")
+	}
+	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("stack.GetMainNICAddress(%d, %d) err = %s", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if addr.Address != addr1 {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1)
+	}
+
+	// We should not have sent any NDP NS messages.
+	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 {
+		t.Fatalf("got NeighborSolicit = %d, want = 0", got)
+	}
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -381,17 +400,8 @@ func TestDADResolve(t *testing.T) {
 				// means something is wrong.
 				t.Fatal("timed out waiting for DAD resolution")
 			case e := <-ndpDisp.dadC:
-				if e.err != nil {
-					t.Fatal("got DAD error: ", e.err)
-				}
-				if e.nicID != nicID {
-					t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID)
-				}
-				if e.addr != addr1 {
-					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
-				}
-				if !e.resolved {
-					t.Fatal("got DAD event w/ resolved = false, want = true")
+				if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 				}
 			}
 			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
@@ -445,6 +455,8 @@ func TestDADResolve(t *testing.T) {
 // a node doing DAD for the same address), or if another node is detected to own
 // the address already (receive an NA message for the tentative address).
 func TestDADFail(t *testing.T) {
+	const nicID = 1
+
 	tests := []struct {
 		name    string
 		makeBuf func(tgt tcpip.Address) buffer.Prependable
@@ -526,22 +538,22 @@ func TestDADFail(t *testing.T) {
 
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(opts)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
-			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
 			}
 
 			// Address should not be considered bound to the NIC yet
 			// (DAD ongoing).
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Receive a packet to simulate multiple nodes owning or
@@ -565,25 +577,16 @@ func TestDADFail(t *testing.T) {
 				// something is wrong.
 				t.Fatal("timed out waiting for DAD failure")
 			case e := <-ndpDisp.dadC:
-				if e.err != nil {
-					t.Fatal("got DAD error: ", e.err)
-				}
-				if e.nicID != 1 {
-					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-				}
-				if e.addr != addr1 {
-					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
-				}
-				if e.resolved {
-					t.Fatal("got DAD event w/ resolved = true, want = false")
+				if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 				}
 			}
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 		})
 	}
@@ -592,6 +595,8 @@ func TestDADFail(t *testing.T) {
 // TestDADStop tests to make sure that the DAD process stops when an address is
 // removed.
 func TestDADStop(t *testing.T) {
+	const nicID = 1
+
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent, 1),
 	}
@@ -607,26 +612,26 @@ func TestDADStop(t *testing.T) {
 
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
 
-	if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
 	}
 
 	// Address should not be considered bound to the NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 
 	// Remove the address. This should stop DAD.
-	if err := s.RemoveAddress(1, addr1); err != nil {
-		t.Fatalf("RemoveAddress(_, %s) = %s", addr1, err)
+	if err := s.RemoveAddress(nicID, addr1); err != nil {
+		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
 	}
 
 	// Wait for DAD to fail (since the address was removed during DAD).
@@ -636,26 +641,16 @@ func TestDADStop(t *testing.T) {
 		// time + extra 1s buffer, something is wrong.
 		t.Fatal("timed out waiting for DAD failure")
 	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != 1 {
-			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-		}
-		if e.addr != addr1 {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+		if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 		}
-		if e.resolved {
-			t.Fatal("got DAD event w/ resolved = true, want = false")
-		}
-
 	}
-	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 
 	// Should not have sent more than 1 NS message.
@@ -681,6 +676,10 @@ func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
 // configurations without affecting the default NDP configurations or other
 // interfaces' configurations.
 func TestSetNDPConfigurations(t *testing.T) {
+	const nicID1 = 1
+	const nicID2 = 2
+	const nicID3 = 3
+
 	tests := []struct {
 		name                    string
 		dupAddrDetectTransmits  uint8
@@ -704,7 +703,7 @@ func TestSetNDPConfigurations(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			ndpDisp := ndpDispatcher{
-				dadC: make(chan ndpDADEvent),
+				dadC: make(chan ndpDADEvent, 1),
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
@@ -712,17 +711,28 @@ func TestSetNDPConfigurations(t *testing.T) {
 				NDPDisp:          &ndpDisp,
 			})
 
+			expectDADEvent := func(nicID tcpip.NICID, addr tcpip.Address) {
+				select {
+				case e := <-ndpDisp.dadC:
+					if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" {
+						t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatalf("expected DAD event for %s", addr)
+				}
+			}
+
 			// This NIC(1)'s NDP configurations will be updated to
 			// be different from the default.
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(1) = %s", err)
+			if err := s.CreateNIC(nicID1, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
 			}
 
 			// Created before updating NIC(1)'s NDP configurations
 			// but updating NIC(1)'s NDP configurations should not
 			// affect other existing NICs.
-			if err := s.CreateNIC(2, e); err != nil {
-				t.Fatalf("CreateNIC(2) = %s", err)
+			if err := s.CreateNIC(nicID2, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
 			}
 
 			// Update the NDP configurations on NIC(1) to use DAD.
@@ -730,36 +740,38 @@ func TestSetNDPConfigurations(t *testing.T) {
 				DupAddrDetectTransmits: test.dupAddrDetectTransmits,
 				RetransmitTimer:        test.retransmitTimer,
 			}
-			if err := s.SetNDPConfigurations(1, configs); err != nil {
-				t.Fatalf("got SetNDPConfigurations(1, _) = %s", err)
+			if err := s.SetNDPConfigurations(nicID1, configs); err != nil {
+				t.Fatalf("got SetNDPConfigurations(%d, _) = %s", nicID1, err)
 			}
 
 			// Created after updating NIC(1)'s NDP configurations
 			// but the stack's default NDP configurations should not
 			// have been updated.
-			if err := s.CreateNIC(3, e); err != nil {
-				t.Fatalf("CreateNIC(3) = %s", err)
+			if err := s.CreateNIC(nicID3, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID3, err)
 			}
 
 			// Add addresses for each NIC.
-			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
-				t.Fatalf("AddAddress(1, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+			if err := s.AddAddress(nicID1, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID1, header.IPv6ProtocolNumber, addr1, err)
 			}
-			if err := s.AddAddress(2, header.IPv6ProtocolNumber, addr2); err != nil {
-				t.Fatalf("AddAddress(2, %d, %s) = %s", header.IPv6ProtocolNumber, addr2, err)
+			if err := s.AddAddress(nicID2, header.IPv6ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID2, header.IPv6ProtocolNumber, addr2, err)
 			}
-			if err := s.AddAddress(3, header.IPv6ProtocolNumber, addr3); err != nil {
-				t.Fatalf("AddAddress(3, %d, %s) = %s", header.IPv6ProtocolNumber, addr3, err)
+			expectDADEvent(nicID2, addr2)
+			if err := s.AddAddress(nicID3, header.IPv6ProtocolNumber, addr3); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID3, header.IPv6ProtocolNumber, addr3, err)
 			}
+			expectDADEvent(nicID3, addr3)
 
 			// Address should not be considered bound to NIC(1) yet
 			// (DAD ongoing).
-			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err := s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Should get the address on NIC(2) and NIC(3)
@@ -767,31 +779,31 @@ func TestSetNDPConfigurations(t *testing.T) {
 			// it as the stack was configured to not do DAD by
 			// default and we only updated the NDP configurations on
 			// NIC(1).
-			addr, err = s.GetMainNICAddress(2, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID2, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(2, _) err = %s", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID2, header.IPv6ProtocolNumber, err)
 			}
 			if addr.Address != addr2 {
-				t.Fatalf("got stack.GetMainNICAddress(2, _) = %s, want = %s", addr, addr2)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID2, header.IPv6ProtocolNumber, addr, addr2)
 			}
-			addr, err = s.GetMainNICAddress(3, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID3, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(3, _) err = %s", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID3, header.IPv6ProtocolNumber, err)
 			}
 			if addr.Address != addr3 {
-				t.Fatalf("got stack.GetMainNICAddress(3, _) = %s, want = %s", addr, addr3)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID3, header.IPv6ProtocolNumber, addr, addr3)
 			}
 
 			// Sleep until right (500ms before) before resolution to
 			// make sure the address didn't resolve on NIC(1) yet.
 			const delta = 500 * time.Millisecond
 			time.Sleep(time.Duration(test.dupAddrDetectTransmits)*test.expectedRetransmitTimer - delta)
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
 			}
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
-				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want)
 			}
 
 			// Wait for DAD to resolve.
@@ -805,25 +817,16 @@ func TestSetNDPConfigurations(t *testing.T) {
 				// means something is wrong.
 				t.Fatal("timed out waiting for DAD resolution")
 			case e := <-ndpDisp.dadC:
-				if e.err != nil {
-					t.Fatal("got DAD error: ", e.err)
-				}
-				if e.nicID != 1 {
-					t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-				}
-				if e.addr != addr1 {
-					t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
-				}
-				if !e.resolved {
-					t.Fatal("got DAD event w/ resolved = false, want = true")
+				if diff := checkDADEvent(e, nicID1, addr1, true, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 				}
 			}
-			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
 			if err != nil {
-				t.Fatalf("stack.GetMainNICAddress(1, _) err = %s", err)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
 			}
 			if addr.Address != addr1 {
-				t.Fatalf("got stack.GetMainNICAddress(1, _) = %s, want = %s", addr, addr1)
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID1, header.IPv6ProtocolNumber, addr, addr1)
 			}
 		})
 	}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index b2c1763bf..24133e6f2 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2184,6 +2184,8 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 // TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
 // link-local addresses will only be assigned after the DAD process resolves.
 func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+	const nicID = 1
+
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent),
 	}
@@ -2197,18 +2199,18 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 
 	e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1)
 	s := stack.New(opts)
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(_) = %s", err)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
 
 	// Address should not be considered bound to the
 	// NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 
 	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
@@ -2222,25 +2224,16 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 		// means something is wrong.
 		t.Fatal("timed out waiting for DAD resolution")
 	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != 1 {
-			t.Fatalf("got DAD event w/ nicID = %d, want = 1", e.nicID)
-		}
-		if e.addr != linkLocalAddr {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
-		}
-		if !e.resolved {
-			t.Fatal("got DAD event w/ resolved = false, want = true")
+		if diff := checkDADEvent(e, nicID, linkLocalAddr, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 		}
 	}
-	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 	if err != nil {
-		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
 	}
 	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 	}
 }
 
@@ -2606,17 +2599,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) {
 	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for DAD resolution")
 	case e := <-ndpDisp.dadC:
-		if e.err != nil {
-			t.Fatal("got DAD error: ", e.err)
-		}
-		if e.nicID != nicID {
-			t.Fatalf("got DAD event w/ nicID = %d, want = %d", e.nicID, nicID)
-		}
-		if e.addr != addr.AddressWithPrefix.Address {
-			t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr.AddressWithPrefix.Address)
-		}
-		if !e.resolved {
-			t.Fatal("got DAD event w/ resolved = false, want = true")
+		if diff := checkDADEvent(e, nicID, addr.AddressWithPrefix.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
 		}
 	}
 	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
-- 
cgit v1.2.3


From c141eb5f430dc50f6bf90232c369b7b3a542155e Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 7 Feb 2020 13:47:57 -0800
Subject: Address GH comments.

---
 pkg/abi/linux/netfilter.go                | 2 +-
 pkg/sentry/socket/netfilter/extensions.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index e4aabb6bb..7363185b7 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -363,7 +363,7 @@ type XTTCP struct {
 	// range to which the matcher applies.
 	DestinationPortStart uint16
 
-	// DestinationPortEnd specifies the start of the destination port
+	// DestinationPortEnd specifies the end of the destination port
 	// range to which the matcher applies.
 	DestinationPortEnd uint16
 
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index b5fbb52e4..3082976cd 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
-- 
cgit v1.2.3


From e1587a28876f8aac689a2cd1b7630f1637655b58 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Fri, 7 Feb 2020 14:00:50 -0800
Subject: Log level, optname, optval and optlen in getsockopt/setsockopt in
 strace.

Log 8, 16, and 32 int optvals and dump the memory of other sizes.

Updates #1782

PiperOrigin-RevId: 293889388
---
 pkg/sentry/strace/linux64_amd64.go |   4 +-
 pkg/sentry/strace/socket.go        | 215 +++++++++++++++++++++++++++++++++++++
 pkg/sentry/strace/strace.go        |  22 +++-
 pkg/sentry/strace/syscalls.go      |  22 +++-
 4 files changed, 258 insertions(+), 5 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index 85ec66fd3..a4de545e9 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -78,8 +78,8 @@ var linuxAMD64 = SyscallMap{
 	51:  makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen),
 	52:  makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen),
 	53:  makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex),
-	54:  makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex),
-	55:  makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex),
+	54:  makeSyscallInfo("setsockopt", FD, SockOptLevel, SockOptName, SetSockOptVal, Hex /* length by value, not a pointer */),
+	55:  makeSyscallInfo("getsockopt", FD, SockOptLevel, SockOptName, GetSockOptVal, SockLen),
 	56:  makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex),
 	57:  makeSyscallInfo("fork"),
 	58:  makeSyscallInfo("vfork"),
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index d2079c85f..f7ff4573e 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -419,3 +419,218 @@ func sockFlags(flags int32) string {
 	}
 	return SocketFlagSet.Parse(uint64(flags))
 }
+
+func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen usermem.Addr, maximumBlobSize uint, rval uintptr) string {
+	if int(rval) < 0 {
+		return hexNum(uint64(optVal))
+	}
+	if optVal == 0 {
+		return "null"
+	}
+	l, err := copySockLen(t, optLen)
+	if err != nil {
+		return fmt.Sprintf("%#x {error reading length: %v}", optLen, err)
+	}
+	return sockOptVal(t, level, optname, optVal, uint64(l), maximumBlobSize)
+}
+
+func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string {
+	switch optLen {
+	case 1:
+		var v uint8
+		_, err := t.CopyIn(optVal, &v)
+		if err != nil {
+			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
+		}
+		return fmt.Sprintf("%#x {value=%v}", optVal, v)
+	case 2:
+		var v uint16
+		_, err := t.CopyIn(optVal, &v)
+		if err != nil {
+			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
+		}
+		return fmt.Sprintf("%#x {value=%v}", optVal, v)
+	case 4:
+		var v uint32
+		_, err := t.CopyIn(optVal, &v)
+		if err != nil {
+			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
+		}
+		return fmt.Sprintf("%#x {value=%v}", optVal, v)
+	default:
+		return dump(t, optVal, uint(optLen), maximumBlobSize)
+	}
+}
+
+var sockOptLevels = abi.ValueSet{
+	linux.SOL_IP:      "SOL_IP",
+	linux.SOL_SOCKET:  "SOL_SOCKET",
+	linux.SOL_TCP:     "SOL_TCP",
+	linux.SOL_UDP:     "SOL_UDP",
+	linux.SOL_IPV6:    "SOL_IPV6",
+	linux.SOL_ICMPV6:  "SOL_ICMPV6",
+	linux.SOL_RAW:     "SOL_RAW",
+	linux.SOL_PACKET:  "SOL_PACKET",
+	linux.SOL_NETLINK: "SOL_NETLINK",
+}
+
+var sockOptNames = map[uint64]abi.ValueSet{
+	linux.SOL_IP: {
+		linux.IP_TTL:                    "IP_TTL",
+		linux.IP_MULTICAST_TTL:          "IP_MULTICAST_TTL",
+		linux.IP_MULTICAST_IF:           "IP_MULTICAST_IF",
+		linux.IP_MULTICAST_LOOP:         "IP_MULTICAST_LOOP",
+		linux.IP_TOS:                    "IP_TOS",
+		linux.IP_RECVTOS:                "IP_RECVTOS",
+		linux.IPT_SO_GET_INFO:           "IPT_SO_GET_INFO",
+		linux.IPT_SO_GET_ENTRIES:        "IPT_SO_GET_ENTRIES",
+		linux.IP_ADD_MEMBERSHIP:         "IP_ADD_MEMBERSHIP",
+		linux.IP_DROP_MEMBERSHIP:        "IP_DROP_MEMBERSHIP",
+		linux.MCAST_JOIN_GROUP:          "MCAST_JOIN_GROUP",
+		linux.IP_ADD_SOURCE_MEMBERSHIP:  "IP_ADD_SOURCE_MEMBERSHIP",
+		linux.IP_BIND_ADDRESS_NO_PORT:   "IP_BIND_ADDRESS_NO_PORT",
+		linux.IP_BLOCK_SOURCE:           "IP_BLOCK_SOURCE",
+		linux.IP_CHECKSUM:               "IP_CHECKSUM",
+		linux.IP_DROP_SOURCE_MEMBERSHIP: "IP_DROP_SOURCE_MEMBERSHIP",
+		linux.IP_FREEBIND:               "IP_FREEBIND",
+		linux.IP_HDRINCL:                "IP_HDRINCL",
+		linux.IP_IPSEC_POLICY:           "IP_IPSEC_POLICY",
+		linux.IP_MINTTL:                 "IP_MINTTL",
+		linux.IP_MSFILTER:               "IP_MSFILTER",
+		linux.IP_MTU_DISCOVER:           "IP_MTU_DISCOVER",
+		linux.IP_MULTICAST_ALL:          "IP_MULTICAST_ALL",
+		linux.IP_NODEFRAG:               "IP_NODEFRAG",
+		linux.IP_OPTIONS:                "IP_OPTIONS",
+		linux.IP_PASSSEC:                "IP_PASSSEC",
+		linux.IP_PKTINFO:                "IP_PKTINFO",
+		linux.IP_RECVERR:                "IP_RECVERR",
+		linux.IP_RECVFRAGSIZE:           "IP_RECVFRAGSIZE",
+		linux.IP_RECVOPTS:               "IP_RECVOPTS",
+		linux.IP_RECVORIGDSTADDR:        "IP_RECVORIGDSTADDR",
+		linux.IP_RECVTTL:                "IP_RECVTTL",
+		linux.IP_RETOPTS:                "IP_RETOPTS",
+		linux.IP_TRANSPARENT:            "IP_TRANSPARENT",
+		linux.IP_UNBLOCK_SOURCE:         "IP_UNBLOCK_SOURCE",
+		linux.IP_UNICAST_IF:             "IP_UNICAST_IF",
+		linux.IP_XFRM_POLICY:            "IP_XFRM_POLICY",
+		linux.MCAST_BLOCK_SOURCE:        "MCAST_BLOCK_SOURCE",
+		linux.MCAST_JOIN_SOURCE_GROUP:   "MCAST_JOIN_SOURCE_GROUP",
+		linux.MCAST_LEAVE_GROUP:         "MCAST_LEAVE_GROUP",
+		linux.MCAST_LEAVE_SOURCE_GROUP:  "MCAST_LEAVE_SOURCE_GROUP",
+		linux.MCAST_MSFILTER:            "MCAST_MSFILTER",
+		linux.MCAST_UNBLOCK_SOURCE:      "MCAST_UNBLOCK_SOURCE",
+		linux.IP_ROUTER_ALERT:           "IP_ROUTER_ALERT",
+		linux.IP_PKTOPTIONS:             "IP_PKTOPTIONS",
+		linux.IP_MTU:                    "IP_MTU",
+	},
+	linux.SOL_SOCKET: {
+		linux.SO_ERROR:        "SO_ERROR",
+		linux.SO_PEERCRED:     "SO_PEERCRED",
+		linux.SO_PASSCRED:     "SO_PASSCRED",
+		linux.SO_SNDBUF:       "SO_SNDBUF",
+		linux.SO_RCVBUF:       "SO_RCVBUF",
+		linux.SO_REUSEADDR:    "SO_REUSEADDR",
+		linux.SO_REUSEPORT:    "SO_REUSEPORT",
+		linux.SO_BINDTODEVICE: "SO_BINDTODEVICE",
+		linux.SO_BROADCAST:    "SO_BROADCAST",
+		linux.SO_KEEPALIVE:    "SO_KEEPALIVE",
+		linux.SO_LINGER:       "SO_LINGER",
+		linux.SO_SNDTIMEO:     "SO_SNDTIMEO",
+		linux.SO_RCVTIMEO:     "SO_RCVTIMEO",
+		linux.SO_OOBINLINE:    "SO_OOBINLINE",
+		linux.SO_TIMESTAMP:    "SO_TIMESTAMP",
+	},
+	linux.SOL_TCP: {
+		linux.TCP_NODELAY:              "TCP_NODELAY",
+		linux.TCP_CORK:                 "TCP_CORK",
+		linux.TCP_QUICKACK:             "TCP_QUICKACK",
+		linux.TCP_MAXSEG:               "TCP_MAXSEG",
+		linux.TCP_KEEPIDLE:             "TCP_KEEPIDLE",
+		linux.TCP_KEEPINTVL:            "TCP_KEEPINTVL",
+		linux.TCP_USER_TIMEOUT:         "TCP_USER_TIMEOUT",
+		linux.TCP_INFO:                 "TCP_INFO",
+		linux.TCP_CC_INFO:              "TCP_CC_INFO",
+		linux.TCP_NOTSENT_LOWAT:        "TCP_NOTSENT_LOWAT",
+		linux.TCP_ZEROCOPY_RECEIVE:     "TCP_ZEROCOPY_RECEIVE",
+		linux.TCP_CONGESTION:           "TCP_CONGESTION",
+		linux.TCP_LINGER2:              "TCP_LINGER2",
+		linux.TCP_DEFER_ACCEPT:         "TCP_DEFER_ACCEPT",
+		linux.TCP_REPAIR_OPTIONS:       "TCP_REPAIR_OPTIONS",
+		linux.TCP_INQ:                  "TCP_INQ",
+		linux.TCP_FASTOPEN:             "TCP_FASTOPEN",
+		linux.TCP_FASTOPEN_CONNECT:     "TCP_FASTOPEN_CONNECT",
+		linux.TCP_FASTOPEN_KEY:         "TCP_FASTOPEN_KEY",
+		linux.TCP_FASTOPEN_NO_COOKIE:   "TCP_FASTOPEN_NO_COOKIE",
+		linux.TCP_KEEPCNT:              "TCP_KEEPCNT",
+		linux.TCP_QUEUE_SEQ:            "TCP_QUEUE_SEQ",
+		linux.TCP_REPAIR:               "TCP_REPAIR",
+		linux.TCP_REPAIR_QUEUE:         "TCP_REPAIR_QUEUE",
+		linux.TCP_REPAIR_WINDOW:        "TCP_REPAIR_WINDOW",
+		linux.TCP_SAVED_SYN:            "TCP_SAVED_SYN",
+		linux.TCP_SAVE_SYN:             "TCP_SAVE_SYN",
+		linux.TCP_SYNCNT:               "TCP_SYNCNT",
+		linux.TCP_THIN_DUPACK:          "TCP_THIN_DUPACK",
+		linux.TCP_THIN_LINEAR_TIMEOUTS: "TCP_THIN_LINEAR_TIMEOUTS",
+		linux.TCP_TIMESTAMP:            "TCP_TIMESTAMP",
+		linux.TCP_ULP:                  "TCP_ULP",
+		linux.TCP_WINDOW_CLAMP:         "TCP_WINDOW_CLAMP",
+	},
+	linux.SOL_IPV6: {
+		linux.IPV6_V6ONLY:              "IPV6_V6ONLY",
+		linux.IPV6_PATHMTU:             "IPV6_PATHMTU",
+		linux.IPV6_TCLASS:              "IPV6_TCLASS",
+		linux.IPV6_ADD_MEMBERSHIP:      "IPV6_ADD_MEMBERSHIP",
+		linux.IPV6_DROP_MEMBERSHIP:     "IPV6_DROP_MEMBERSHIP",
+		linux.IPV6_IPSEC_POLICY:        "IPV6_IPSEC_POLICY",
+		linux.IPV6_JOIN_ANYCAST:        "IPV6_JOIN_ANYCAST",
+		linux.IPV6_LEAVE_ANYCAST:       "IPV6_LEAVE_ANYCAST",
+		linux.IPV6_PKTINFO:             "IPV6_PKTINFO",
+		linux.IPV6_ROUTER_ALERT:        "IPV6_ROUTER_ALERT",
+		linux.IPV6_XFRM_POLICY:         "IPV6_XFRM_POLICY",
+		linux.MCAST_BLOCK_SOURCE:       "MCAST_BLOCK_SOURCE",
+		linux.MCAST_JOIN_GROUP:         "MCAST_JOIN_GROUP",
+		linux.MCAST_JOIN_SOURCE_GROUP:  "MCAST_JOIN_SOURCE_GROUP",
+		linux.MCAST_LEAVE_GROUP:        "MCAST_LEAVE_GROUP",
+		linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP",
+		linux.MCAST_UNBLOCK_SOURCE:     "MCAST_UNBLOCK_SOURCE",
+		linux.IPV6_2292DSTOPTS:         "IPV6_2292DSTOPTS",
+		linux.IPV6_2292HOPLIMIT:        "IPV6_2292HOPLIMIT",
+		linux.IPV6_2292HOPOPTS:         "IPV6_2292HOPOPTS",
+		linux.IPV6_2292PKTINFO:         "IPV6_2292PKTINFO",
+		linux.IPV6_2292PKTOPTIONS:      "IPV6_2292PKTOPTIONS",
+		linux.IPV6_2292RTHDR:           "IPV6_2292RTHDR",
+		linux.IPV6_ADDR_PREFERENCES:    "IPV6_ADDR_PREFERENCES",
+		linux.IPV6_AUTOFLOWLABEL:       "IPV6_AUTOFLOWLABEL",
+		linux.IPV6_DONTFRAG:            "IPV6_DONTFRAG",
+		linux.IPV6_DSTOPTS:             "IPV6_DSTOPTS",
+		linux.IPV6_FLOWINFO:            "IPV6_FLOWINFO",
+		linux.IPV6_FLOWINFO_SEND:       "IPV6_FLOWINFO_SEND",
+		linux.IPV6_FLOWLABEL_MGR:       "IPV6_FLOWLABEL_MGR",
+		linux.IPV6_FREEBIND:            "IPV6_FREEBIND",
+		linux.IPV6_HOPOPTS:             "IPV6_HOPOPTS",
+		linux.IPV6_MINHOPCOUNT:         "IPV6_MINHOPCOUNT",
+		linux.IPV6_MTU:                 "IPV6_MTU",
+		linux.IPV6_MTU_DISCOVER:        "IPV6_MTU_DISCOVER",
+		linux.IPV6_MULTICAST_ALL:       "IPV6_MULTICAST_ALL",
+		linux.IPV6_MULTICAST_HOPS:      "IPV6_MULTICAST_HOPS",
+		linux.IPV6_MULTICAST_IF:        "IPV6_MULTICAST_IF",
+		linux.IPV6_MULTICAST_LOOP:      "IPV6_MULTICAST_LOOP",
+		linux.IPV6_RECVDSTOPTS:         "IPV6_RECVDSTOPTS",
+		linux.IPV6_RECVERR:             "IPV6_RECVERR",
+		linux.IPV6_RECVFRAGSIZE:        "IPV6_RECVFRAGSIZE",
+		linux.IPV6_RECVHOPLIMIT:        "IPV6_RECVHOPLIMIT",
+		linux.IPV6_RECVHOPOPTS:         "IPV6_RECVHOPOPTS",
+		linux.IPV6_RECVORIGDSTADDR:     "IPV6_RECVORIGDSTADDR",
+		linux.IPV6_RECVPATHMTU:         "IPV6_RECVPATHMTU",
+		linux.IPV6_RECVPKTINFO:         "IPV6_RECVPKTINFO",
+		linux.IPV6_RECVRTHDR:           "IPV6_RECVRTHDR",
+		linux.IPV6_RECVTCLASS:          "IPV6_RECVTCLASS",
+		linux.IPV6_RTHDR:               "IPV6_RTHDR",
+		linux.IPV6_RTHDRDSTOPTS:        "IPV6_RTHDRDSTOPTS",
+		linux.IPV6_TRANSPARENT:         "IPV6_TRANSPARENT",
+		linux.IPV6_UNICAST_HOPS:        "IPV6_UNICAST_HOPS",
+		linux.IPV6_UNICAST_IF:          "IPV6_UNICAST_IF",
+		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
+		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
+	},
+}
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 3fc4a47fc..a796b2396 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -55,6 +55,14 @@ var ItimerTypes = abi.ValueSet{
 	linux.ITIMER_PROF:    "ITIMER_PROF",
 }
 
+func hexNum(num uint64) string {
+	return "0x" + strconv.FormatUint(num, 16)
+}
+
+func hexArg(arg arch.SyscallArgument) string {
+	return hexNum(arg.Uint64())
+}
+
 func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, maxBytes uint64) string {
 	if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
 		return fmt.Sprintf("%#x (error decoding iovecs: invalid iovcnt)", addr)
@@ -389,6 +397,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 			output = append(output, path(t, args[arg].Pointer()))
 		case ExecveStringVector:
 			output = append(output, stringVector(t, args[arg].Pointer()))
+		case SetSockOptVal:
+			output = append(output, sockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Uint64() /* optLen */, maximumBlobSize))
+		case SockOptLevel:
+			output = append(output, sockOptLevels.Parse(args[arg].Uint64()))
+		case SockOptName:
+			output = append(output, sockOptNames[args[arg-1].Uint64() /* level */].Parse(args[arg].Uint64()))
 		case SockAddr:
 			output = append(output, sockAddr(t, args[arg].Pointer(), uint32(args[arg+1].Uint64())))
 		case SockLen:
@@ -446,7 +460,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 		case Hex:
 			fallthrough
 		default:
-			output = append(output, "0x"+strconv.FormatUint(args[arg].Uint64(), 16))
+			output = append(output, hexArg(args[arg]))
 		}
 	}
 
@@ -507,6 +521,12 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint
 			output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer())
 		case PollFDs:
 			output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true)
+		case GetSockOptVal:
+			output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval)
+		case SetSockOptVal:
+			// No need to print the value again. While it usually
+			// isn't, the string version of this arg can be long.
+			output[arg] = hexArg(args[arg])
 		}
 	}
 }
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 24e29a2ba..446d1e0f6 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -207,9 +207,27 @@ const (
 	// array is in the next argument.
 	PollFDs
 
-	// SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of
-	// fds represented must be the first argument.
+	// SelectFDSet is an fd_set argument in select(2)/pselect(2). The
+	// number of FDs represented must be the first argument.
 	SelectFDSet
+
+	// GetSockOptVal is the optval argument in getsockopt(2).
+	//
+	// Formatted after syscall execution.
+	GetSockOptVal
+
+	// SetSockOptVal is the optval argument in setsockopt(2).
+	//
+	// Contents omitted after syscall execution.
+	SetSockOptVal
+
+	// SockOptLevel is the level argument in getsockopt(2) and
+	// setsockopt(2).
+	SockOptLevel
+
+	// SockOptLevel is the optname argument in getsockopt(2) and
+	// setsockopt(2).
+	SockOptName
 )
 
 // defaultFormat is the syscall argument format to use if the actual format is
-- 
cgit v1.2.3


From 17b9f5e66238bde1e4ed3bd9e5fb67342c8b58ec Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 7 Feb 2020 14:46:24 -0800
Subject: Support listxattr and removexattr syscalls.

Note that these are only implemented for tmpfs, and other impls will still
return EOPNOTSUPP.

PiperOrigin-RevId: 293899385
---
 pkg/p9/client_file.go                      |  33 ++++
 pkg/p9/file.go                             |  16 ++
 pkg/p9/handlers.go                         |  33 ++++
 pkg/p9/messages.go                         | 199 +++++++++++++++----
 pkg/p9/p9.go                               |   4 +
 pkg/p9/version.go                          |   8 +-
 pkg/sentry/fs/copy_up.go                   |   2 +-
 pkg/sentry/fs/fsutil/inode.go              |  20 +-
 pkg/sentry/fs/gofer/context_file.go        |  14 ++
 pkg/sentry/fs/gofer/inode.go               |  13 +-
 pkg/sentry/fs/inode.go                     |  14 +-
 pkg/sentry/fs/inode_operations.go          |  13 +-
 pkg/sentry/fs/inode_overlay.go             |  18 +-
 pkg/sentry/fs/tmpfs/tmpfs.go               |   9 +-
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++-
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 ++--
 pkg/sentry/syscalls/linux/sys_xattr.go     | 200 +++++++++++++++++++-
 runsc/fsgofer/fsgofer.go                   |  16 +-
 test/syscalls/linux/BUILD                  |   1 +
 test/syscalls/linux/xattr.cc               | 294 ++++++++++++++++-------------
 20 files changed, 733 insertions(+), 238 deletions(-)

(limited to 'pkg')

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 0254e4ccc..2ee07b664 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -194,6 +194,39 @@ func (c *clientFile) SetXattr(name, value string, flags uint32) error {
 	return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{})
 }
 
+// ListXattr implements File.ListXattr.
+func (c *clientFile) ListXattr(size uint64) (map[string]struct{}, error) {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return nil, syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return nil, syscall.EOPNOTSUPP
+	}
+
+	rlistxattr := Rlistxattr{}
+	if err := c.client.sendRecv(&Tlistxattr{FID: c.fid, Size: size}, &rlistxattr); err != nil {
+		return nil, err
+	}
+
+	xattrs := make(map[string]struct{}, len(rlistxattr.Xattrs))
+	for _, x := range rlistxattr.Xattrs {
+		xattrs[x] = struct{}{}
+	}
+	return xattrs, nil
+}
+
+// RemoveXattr implements File.RemoveXattr.
+func (c *clientFile) RemoveXattr(name string) error {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return syscall.EOPNOTSUPP
+	}
+
+	return c.client.sendRecv(&Tremovexattr{FID: c.fid, Name: name}, &Rremovexattr{})
+}
+
 // Allocate implements File.Allocate.
 func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error {
 	if atomic.LoadUint32(&c.closed) != 0 {
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index 4607cfcdf..d4ffbc8e3 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -105,6 +105,22 @@ type File interface {
 	// TODO(b/127675828): Determine concurrency guarantees once implemented.
 	SetXattr(name, value string, flags uint32) error
 
+	// ListXattr lists the names of the extended attributes on this node.
+	//
+	// Size indicates the size of the buffer that has been allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	ListXattr(size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes extended attributes on this node.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	RemoveXattr(name string) error
+
 	// Allocate allows the caller to directly manipulate the allocated disk space
 	// for the file. See fallocate(2) for more details.
 	Allocate(mode AllocateMode, offset, length uint64) error
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 7d6653a07..2ac45eb80 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -941,6 +941,39 @@ func (t *Tsetxattr) handle(cs *connState) message {
 	return &Rsetxattr{}
 }
 
+// handle implements handler.handle.
+func (t *Tlistxattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	xattrs, err := ref.file.ListXattr(t.Size)
+	if err != nil {
+		return newErr(err)
+	}
+	xattrList := make([]string, 0, len(xattrs))
+	for x := range xattrs {
+		xattrList = append(xattrList, x)
+	}
+	return &Rlistxattr{Xattrs: xattrList}
+}
+
+// handle implements handler.handle.
+func (t *Tremovexattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	if err := ref.file.RemoveXattr(t.Name); err != nil {
+		return newErr(err)
+	}
+	return &Rremovexattr{}
+}
+
 // handle implements handler.handle.
 func (t *Treaddir) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.Directory)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index ceb723d86..b1cede5f5 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -174,11 +174,11 @@ type Rflush struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflush) Decode(b *buffer) {
+func (*Rflush) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflush) Encode(b *buffer) {
+func (*Rflush) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -188,7 +188,7 @@ func (*Rflush) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rflush) String() string {
-	return fmt.Sprintf("RFlush{}")
+	return "RFlush{}"
 }
 
 // Twalk is a walk request.
@@ -300,11 +300,11 @@ type Rclunk struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rclunk) Decode(b *buffer) {
+func (*Rclunk) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rclunk) Encode(b *buffer) {
+func (*Rclunk) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -314,7 +314,7 @@ func (*Rclunk) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rclunk) String() string {
-	return fmt.Sprintf("Rclunk{}")
+	return "Rclunk{}"
 }
 
 // Tremove is a remove request.
@@ -350,11 +350,11 @@ type Rremove struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rremove) Decode(b *buffer) {
+func (*Rremove) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rremove) Encode(b *buffer) {
+func (*Rremove) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -364,7 +364,7 @@ func (*Rremove) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rremove) String() string {
-	return fmt.Sprintf("Rremove{}")
+	return "Rremove{}"
 }
 
 // Rlerror is an error response.
@@ -745,16 +745,16 @@ func (*Rlink) Type() MsgType {
 }
 
 // Decode implements encoder.Decode.
-func (*Rlink) Decode(b *buffer) {
+func (*Rlink) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rlink) Encode(b *buffer) {
+func (*Rlink) Encode(*buffer) {
 }
 
 // String implements fmt.Stringer.
 func (r *Rlink) String() string {
-	return fmt.Sprintf("Rlink{}")
+	return "Rlink{}"
 }
 
 // Trenameat is a rename request.
@@ -803,11 +803,11 @@ type Rrenameat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrenameat) Decode(b *buffer) {
+func (*Rrenameat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrenameat) Encode(b *buffer) {
+func (*Rrenameat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -817,7 +817,7 @@ func (*Rrenameat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrenameat) String() string {
-	return fmt.Sprintf("Rrenameat{}")
+	return "Rrenameat{}"
 }
 
 // Tunlinkat is an unlink request.
@@ -861,11 +861,11 @@ type Runlinkat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Runlinkat) Decode(b *buffer) {
+func (*Runlinkat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Runlinkat) Encode(b *buffer) {
+func (*Runlinkat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -875,7 +875,7 @@ func (*Runlinkat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Runlinkat) String() string {
-	return fmt.Sprintf("Runlinkat{}")
+	return "Runlinkat{}"
 }
 
 // Trename is a rename request.
@@ -922,11 +922,11 @@ type Rrename struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrename) Decode(b *buffer) {
+func (*Rrename) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrename) Encode(b *buffer) {
+func (*Rrename) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -936,7 +936,7 @@ func (*Rrename) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrename) String() string {
-	return fmt.Sprintf("Rrename{}")
+	return "Rrename{}"
 }
 
 // Treadlink is a readlink request.
@@ -1409,11 +1409,11 @@ type Rsetattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rsetattr) Decode(b *buffer) {
+func (*Rsetattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rsetattr) Encode(b *buffer) {
+func (*Rsetattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1423,7 +1423,7 @@ func (*Rsetattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetattr) String() string {
-	return fmt.Sprintf("Rsetattr{}")
+	return "Rsetattr{}"
 }
 
 // Tallocate is an allocate request. This is an extension to 9P protocol, not
@@ -1466,11 +1466,11 @@ type Rallocate struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rallocate) Decode(b *buffer) {
+func (*Rallocate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rallocate) Encode(b *buffer) {
+func (*Rallocate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1480,7 +1480,71 @@ func (*Rallocate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rallocate) String() string {
-	return fmt.Sprintf("Rallocate{}")
+	return "Rallocate{}"
+}
+
+// Tlistxattr is a listxattr request.
+type Tlistxattr struct {
+	// FID refers to the file on which to list xattrs.
+	FID FID
+
+	// Size is the buffer size for the xattr list.
+	Size uint64
+}
+
+// Decode implements encoder.Decode.
+func (t *Tlistxattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Size = b.Read64()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tlistxattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.Write64(t.Size)
+}
+
+// Type implements message.Type.
+func (*Tlistxattr) Type() MsgType {
+	return MsgTlistxattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tlistxattr) String() string {
+	return fmt.Sprintf("Tlistxattr{FID: %d, Size: %d}", t.FID, t.Size)
+}
+
+// Rlistxattr is a listxattr response.
+type Rlistxattr struct {
+	// Xattrs is a list of extended attribute names.
+	Xattrs []string
+}
+
+// Decode implements encoder.Decode.
+func (r *Rlistxattr) Decode(b *buffer) {
+	n := b.Read16()
+	r.Xattrs = r.Xattrs[:0]
+	for i := 0; i < int(n); i++ {
+		r.Xattrs = append(r.Xattrs, b.ReadString())
+	}
+}
+
+// Encode implements encoder.Encode.
+func (r *Rlistxattr) Encode(b *buffer) {
+	b.Write16(uint16(len(r.Xattrs)))
+	for _, x := range r.Xattrs {
+		b.WriteString(x)
+	}
+}
+
+// Type implements message.Type.
+func (*Rlistxattr) Type() MsgType {
+	return MsgRlistxattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rlistxattr) String() string {
+	return fmt.Sprintf("Rlistxattr{Xattrs: %v}", r.Xattrs)
 }
 
 // Txattrwalk walks extended attributes.
@@ -1594,11 +1658,11 @@ type Rxattrcreate struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rxattrcreate) Decode(b *buffer) {
+func (r *Rxattrcreate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rxattrcreate) Encode(b *buffer) {
+func (r *Rxattrcreate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1608,7 +1672,7 @@ func (*Rxattrcreate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rxattrcreate) String() string {
-	return fmt.Sprintf("Rxattrcreate{}")
+	return "Rxattrcreate{}"
 }
 
 // Tgetxattr is a getxattr request.
@@ -1719,11 +1783,11 @@ type Rsetxattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rsetxattr) Decode(b *buffer) {
+func (r *Rsetxattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rsetxattr) Encode(b *buffer) {
+func (r *Rsetxattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1733,7 +1797,60 @@ func (*Rsetxattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetxattr) String() string {
-	return fmt.Sprintf("Rsetxattr{}")
+	return "Rsetxattr{}"
+}
+
+// Tremovexattr is a removexattr request.
+type Tremovexattr struct {
+	// FID refers to the file on which to set xattrs.
+	FID FID
+
+	// Name is the attribute name.
+	Name string
+}
+
+// Decode implements encoder.Decode.
+func (t *Tremovexattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Name = b.ReadString()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tremovexattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.WriteString(t.Name)
+}
+
+// Type implements message.Type.
+func (*Tremovexattr) Type() MsgType {
+	return MsgTremovexattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tremovexattr) String() string {
+	return fmt.Sprintf("Tremovexattr{FID: %d, Name: %s}", t.FID, t.Name)
+}
+
+// Rremovexattr is a removexattr response.
+type Rremovexattr struct {
+}
+
+// Decode implements encoder.Decode.
+func (r *Rremovexattr) Decode(*buffer) {
+}
+
+// Encode implements encoder.Encode.
+func (r *Rremovexattr) Encode(*buffer) {
+}
+
+// Type implements message.Type.
+func (*Rremovexattr) Type() MsgType {
+	return MsgRremovexattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rremovexattr) String() string {
+	return "Rremovexattr{}"
 }
 
 // Treaddir is a readdir request.
@@ -1880,11 +1997,11 @@ type Rfsync struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rfsync) Decode(b *buffer) {
+func (*Rfsync) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rfsync) Encode(b *buffer) {
+func (*Rfsync) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1894,7 +2011,7 @@ func (*Rfsync) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rfsync) String() string {
-	return fmt.Sprintf("Rfsync{}")
+	return "Rfsync{}"
 }
 
 // Tstatfs is a stat request.
@@ -1980,11 +2097,11 @@ type Rflushf struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflushf) Decode(b *buffer) {
+func (*Rflushf) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflushf) Encode(b *buffer) {
+func (*Rflushf) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1994,7 +2111,7 @@ func (*Rflushf) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (*Rflushf) String() string {
-	return fmt.Sprintf("Rflushf{}")
+	return "Rflushf{}"
 }
 
 // Twalkgetattr is a walk request.
@@ -2484,6 +2601,8 @@ func init() {
 	msgRegistry.register(MsgRgetattr, func() message { return &Rgetattr{} })
 	msgRegistry.register(MsgTsetattr, func() message { return &Tsetattr{} })
 	msgRegistry.register(MsgRsetattr, func() message { return &Rsetattr{} })
+	msgRegistry.register(MsgTlistxattr, func() message { return &Tlistxattr{} })
+	msgRegistry.register(MsgRlistxattr, func() message { return &Rlistxattr{} })
 	msgRegistry.register(MsgTxattrwalk, func() message { return &Txattrwalk{} })
 	msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} })
 	msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} })
@@ -2492,6 +2611,8 @@ func init() {
 	msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} })
 	msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} })
 	msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} })
+	msgRegistry.register(MsgTremovexattr, func() message { return &Tremovexattr{} })
+	msgRegistry.register(MsgRremovexattr, func() message { return &Rremovexattr{} })
 	msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} })
 	msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} })
 	msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} })
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 5ab00d625..20ab31f7a 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -335,6 +335,8 @@ const (
 	MsgRgetattr             = 25
 	MsgTsetattr             = 26
 	MsgRsetattr             = 27
+	MsgTlistxattr           = 28
+	MsgRlistxattr           = 29
 	MsgTxattrwalk           = 30
 	MsgRxattrwalk           = 31
 	MsgTxattrcreate         = 32
@@ -343,6 +345,8 @@ const (
 	MsgRgetxattr            = 35
 	MsgTsetxattr            = 36
 	MsgRsetxattr            = 37
+	MsgTremovexattr         = 38
+	MsgRremovexattr         = 39
 	MsgTreaddir             = 40
 	MsgRreaddir             = 41
 	MsgTfsync               = 50
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index 34a15eb55..09cde9f5a 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 10
+	highestSupportedVersion uint32 = 11
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -167,3 +167,9 @@ func VersionSupportsOpenTruncateFlag(v uint32) bool {
 func versionSupportsGetSetXattr(v uint32) bool {
 	return v >= 10
 }
+
+// versionSupportsListRemoveXattr returns true if version v supports
+// the Tlistxattr and Tremovexattr messages.
+func versionSupportsListRemoveXattr(v uint32) bool {
+	return v >= 11
+}
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index f6c79e51b..b060a12ff 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -401,7 +401,7 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error
 	if err != nil {
 		return err
 	}
-	lowerXattr, err := lower.ListXattr(ctx)
+	lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX)
 	if err != nil && err != syserror.EOPNOTSUPP {
 		return err
 	}
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index 252830572..daecc4ffe 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -247,7 +247,7 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode,
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	i.mu.RLock()
 	names := make(map[string]struct{}, len(i.xattrs))
 	for name := range i.xattrs {
@@ -257,6 +257,17 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (m
 	return names, nil
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
+	i.mu.RLock()
+	defer i.mu.RUnlock()
+	if _, ok := i.xattrs[name]; ok {
+		delete(i.xattrs, name)
+		return nil
+	}
+	return syserror.ENOATTR
+}
+
 // staticFile is a file with static contents. It is returned by
 // InodeStaticFileGetter.GetFile.
 //
@@ -460,10 +471,15 @@ func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, st
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	return nil, syserror.EOPNOTSUPP
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error {
+	return syserror.EOPNOTSUPP
+}
+
 // InodeNoopRelease implements fs.InodeOperations.Release as a noop.
 type InodeNoopRelease struct{}
 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go
index 3da818aed..125907d70 100644
--- a/pkg/sentry/fs/gofer/context_file.go
+++ b/pkg/sentry/fs/gofer/context_file.go
@@ -73,6 +73,20 @@ func (c *contextFile) setXattr(ctx context.Context, name, value string, flags ui
 	return err
 }
 
+func (c *contextFile) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := c.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
+func (c *contextFile) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := c.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := c.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index ac28174d2..1c934981b 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -604,18 +604,23 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length
 }
 
 // GetXattr implements fs.InodeOperations.GetXattr.
-func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) {
+func (i *inodeOperations) GetXattr(ctx context.Context, _ *fs.Inode, name string, size uint64) (string, error) {
 	return i.fileState.file.getXattr(ctx, name, size)
 }
 
 // SetXattr implements fs.InodeOperations.SetXattr.
-func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error {
+func (i *inodeOperations) SetXattr(ctx context.Context, _ *fs.Inode, name string, value string, flags uint32) error {
 	return i.fileState.file.setXattr(ctx, name, value, flags)
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
-	return nil, syscall.EOPNOTSUPP
+func (i *inodeOperations) ListXattr(ctx context.Context, _ *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return i.fileState.file.listXattr(ctx, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *inodeOperations) RemoveXattr(ctx context.Context, _ *fs.Inode, name string) error {
+	return i.fileState.file.removeXattr(ctx, name)
 }
 
 // Allocate implements fs.InodeOperations.Allocate.
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b66c091ab..55fb71c16 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -278,11 +278,19 @@ func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, fla
 }
 
 // ListXattr calls i.InodeOperations.ListXattr with i as the Inode.
-func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) {
+func (i *Inode) ListXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
 	if i.overlay != nil {
-		return overlayListXattr(ctx, i.overlay)
+		return overlayListXattr(ctx, i.overlay, size)
 	}
-	return i.InodeOperations.ListXattr(ctx, i)
+	return i.InodeOperations.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr calls i.InodeOperations.RemoveXattr with i as the Inode.
+func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error {
+	if i.overlay != nil {
+		return overlayRemoveXattr(ctx, i.overlay, d, name)
+	}
+	return i.InodeOperations.RemoveXattr(ctx, i, name)
 }
 
 // CheckPermission will check if the caller may access this file in the
diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index 70f2eae96..2bbfb72ef 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -190,7 +190,18 @@ type InodeOperations interface {
 	// ListXattr returns the set of all extended attributes names that
 	// have values. Inodes that do not support extended attributes return
 	// EOPNOTSUPP.
-	ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error)
+	//
+	// If this is called through the listxattr(2) syscall, size indicates the
+	// size of the buffer that the application has allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely. All size checking is done independently
+	// at the syscall layer.
+	ListXattr(ctx context.Context, inode *Inode, size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes an extended attribute specified by name. Inodes that
+	// do not support extended attributes return EOPNOTSUPP.
+	RemoveXattr(ctx context.Context, inode *Inode, name string) error
 
 	// Check determines whether an Inode can be accessed with the
 	// requested permission mask using the context (which gives access
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 4729b4aac..5ada33a32 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -564,15 +564,15 @@ func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, valu
 	return o.upper.SetXattr(ctx, d, name, value, flags)
 }
 
-func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) {
+func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
 	var names map[string]struct{}
 	var err error
 	if o.upper != nil {
-		names, err = o.upper.ListXattr(ctx)
+		names, err = o.upper.ListXattr(ctx, size)
 	} else {
-		names, err = o.lower.ListXattr(ctx)
+		names, err = o.lower.ListXattr(ctx, size)
 	}
 	for name := range names {
 		// Same as overlayGetXattr, we shouldn't forward along
@@ -584,6 +584,18 @@ func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}
 	return names, err
 }
 
+func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
+	// Don't allow changes to overlay xattrs through a removexattr syscall.
+	if strings.HasPrefix(XattrOverlayPrefix, name) {
+		return syserror.EPERM
+	}
+
+	if err := copyUp(ctx, d); err != nil {
+		return err
+	}
+	return o.upper.RemoveXattr(ctx, d, name)
+}
+
 func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error {
 	o.copyMu.RLock()
 	// Hot path. Avoid defers.
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index c00cef0a5..3c2b583ae 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -159,8 +159,13 @@ func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, fla
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) {
-	return d.ramfsDir.ListXattr(ctx, i)
+func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return d.ramfsDir.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (d *Dir) RemoveXattr(ctx context.Context, i *fs.Inode, name string) error {
+	return d.ramfsDir.RemoveXattr(ctx, i, name)
 }
 
 // Lookup implements fs.InodeOperations.Lookup.
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 588f8b087..79066ad2a 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,21 +228,18 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 06e5ee401..7421619de 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,26 +36,23 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		11:  syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		12:  syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		13:  syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		14:  syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		15:  syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		16:  syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index efb95555c..342337726 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -72,7 +72,7 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 	}
 
 	valueLen := 0
-	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -172,7 +172,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 		return 0, nil, err
 	}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -187,12 +187,12 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
 		return syserror.EINVAL
 	}
 
-	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
 		return err
 	}
 
-	name, err := copyInXattrName(t, nameAddr)
-	if err != nil {
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
 		return err
 	}
 
@@ -226,12 +226,18 @@ func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
 	return name, nil
 }
 
+// Restrict xattrs to regular files and directories.
+//
+// TODO(b/148380782): In Linux, this restriction technically only applies to
+// xattrs in the "user.*" namespace. Make file type checks specific to the
+// namespace once we allow other xattr prefixes.
+func xattrFileTypeOk(i *fs.Inode) bool {
+	return fs.IsRegular(i.StableAttr) || fs.IsDir(i.StableAttr)
+}
+
 func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
 	// Restrict xattrs to regular files and directories.
-	//
-	// In Linux, this restriction technically only applies to xattrs in the
-	// "user.*" namespace, but we don't allow any other xattr prefixes anyway.
-	if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+	if !xattrFileTypeOk(i) {
 		if perms.Write {
 			return syserror.EPERM
 		}
@@ -240,3 +246,179 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error
 
 	return i.CheckPermission(t, perms)
 }
+
+// ListXattr implements linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, true)
+}
+
+// LListXattr implements linux syscall llistxattr(2).
+func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, false)
+}
+
+// FListXattr implements linux syscall flistxattr(2).
+func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	listAddr := args[1].Pointer()
+	size := uint64(args[2].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	n, err := listXattr(t, f.Dirent, listAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := uint64(args[2].SizeT())
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		n, err = listXattr(t, d, listAddr, size)
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) {
+	if !xattrFileTypeOk(d.Inode) {
+		return 0, nil
+	}
+
+	// If listxattr(2) is called with size 0, the buffer size needed to contain
+	// the xattr list will be returned successfully even if it is nonzero. In
+	// that case, we need to retrieve the entire list so we can compute and
+	// return the correct size.
+	requestedSize := size
+	if size == 0 || size > linux.XATTR_SIZE_MAX {
+		requestedSize = linux.XATTR_SIZE_MAX
+	}
+	xattrs, err := d.Inode.ListXattr(t, requestedSize)
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(b/148380782): support namespaces other than "user".
+	for x := range xattrs {
+		if !strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+			delete(xattrs, x)
+		}
+	}
+
+	listSize := xattrListSize(xattrs)
+	if listSize > linux.XATTR_SIZE_MAX {
+		return 0, syserror.E2BIG
+	}
+	if uint64(listSize) > requestedSize {
+		return 0, syserror.ERANGE
+	}
+
+	// Don't copy out the attributes if size is 0.
+	if size == 0 {
+		return listSize, nil
+	}
+
+	buf := make([]byte, 0, listSize)
+	for x := range xattrs {
+		buf = append(buf, []byte(x)...)
+		buf = append(buf, 0)
+	}
+	if _, err := t.CopyOutBytes(addr, buf); err != nil {
+		return 0, err
+	}
+
+	return len(buf), nil
+}
+
+func xattrListSize(xattrs map[string]struct{}) int {
+	size := 0
+	for x := range xattrs {
+		size += len(x) + 1
+	}
+	return size
+}
+
+// RemoveXattr implements linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, true)
+}
+
+// LRemoveXattr implements linux syscall lremovexattr(2).
+func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, false)
+}
+
+// FRemoveXattr implements linux syscall fremovexattr(2).
+func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	return 0, nil, removeXattr(t, f.Dirent, nameAddr)
+}
+
+func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		return removeXattr(t, d, nameAddr)
+	})
+}
+
+// removeXattr implements removexattr(2) from the given *fs.Dirent.
+func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.RemoveXattr(t, d, name)
+}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 4d84ad999..cadd83273 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -768,12 +768,22 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 }
 
 // TODO(b/127675828): support getxattr.
-func (l *localFile) GetXattr(name string, size uint64) (string, error) {
+func (*localFile) GetXattr(string, uint64) (string, error) {
 	return "", syscall.EOPNOTSUPP
 }
 
 // TODO(b/127675828): support setxattr.
-func (l *localFile) SetXattr(name, value string, flags uint32) error {
+func (*localFile) SetXattr(string, string, uint32) error {
+	return syscall.EOPNOTSUPP
+}
+
+// TODO(b/148303075): support listxattr.
+func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
+	return nil, syscall.EOPNOTSUPP
+}
+
+// TODO(b/148303075): support removexattr.
+func (*localFile) RemoveXattr(string) error {
 	return syscall.EOPNOTSUPP
 }
 
@@ -790,7 +800,7 @@ func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error
 }
 
 // Rename implements p9.File; this should never be called.
-func (l *localFile) Rename(p9.File, string) error {
+func (*localFile) Rename(p9.File, string) error {
 	panic("rename called directly")
 }
 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 12d389c3e..ca1af209a 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3782,6 +3782,7 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         gtest,
         "//test/util:posix_error",
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 85eb31847..8b00ef44c 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -24,6 +24,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "absl/container/flat_hash_set.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
@@ -38,36 +39,36 @@ namespace {
 
 class XattrTest : public FileTest {};
 
-TEST_F(XattrTest, XattrNullName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, XattrNonexistentFile) {
+  const char* path = "/does/not/exist";
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT));
+}
 
+TEST_F(XattrTest, XattrNullName) {
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
   EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
               SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(EFAULT));
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, ""), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrLargeName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -86,28 +87,23 @@ TEST_F(XattrTest, XattrLargeName) {
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, name.c_str()), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EOPNOTSUPP));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(removexattr(path, name.c_str()),
+              SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -124,19 +120,21 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EACCES));
 
   char buf = '-';
   EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 }
 
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -152,6 +150,14 @@ TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+
+  // listxattr will succeed even without read permissions.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
@@ -163,64 +169,66 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
   const char name[] = "trusted.abc";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(dir.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(dir.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(dir.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(dir.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(link.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(link.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(link.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(link.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
-  EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(char_device, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+  EXPECT_THAT(getxattr(char_device, name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(char_device, nullptr, 0), SyscallSucceedsWithValue(0));
 
   // Use tmpfs, where creation of named pipes is supported.
   const std::string fifo = NewTempAbsPathInDir("/dev/shm");
   const char* path = fifo.c_str();
   EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
-  EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(path, nullptr, 0), SyscallSucceedsWithValue(0));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -236,10 +244,6 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -252,10 +256,6 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -271,10 +271,6 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -284,10 +280,6 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -296,10 +288,6 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -316,10 +304,6 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -335,10 +319,6 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -353,10 +333,6 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -368,10 +344,6 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -384,10 +356,6 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -395,10 +363,6 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -411,10 +375,6 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -427,10 +387,6 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -446,10 +402,6 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -463,10 +415,6 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -483,10 +431,6 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -498,10 +442,6 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -518,35 +458,109 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, Listxattr) {
+  const char* path = test_file_name_.c_str();
+  const std::string name = "user.test";
+  const std::string name2 = "user.test2";
+  const std::string name3 = "user.test3";
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name2.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name3.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
 
+  std::vector<char> list(name.size() + 1 + name2.size() + 1 + name3.size() + 1);
+  char* buf = list.data();
+  EXPECT_THAT(listxattr(path, buf, XATTR_SIZE_MAX),
+              SyscallSucceedsWithValue(list.size()));
+
+  absl::flat_hash_set<std::string> got = {};
+  for (char* p = buf; p < buf + list.size(); p += strlen(p) + 1) {
+    got.insert(std::string{p});
+  }
+
+  absl::flat_hash_set<std::string> expected = {name, name2, name3};
+  EXPECT_EQ(got, expected);
+}
+
+TEST_F(XattrTest, ListxattrNoXattrs) {
+  const char* path = test_file_name_.c_str();
+
+  std::vector<char> list, expected;
+  EXPECT_THAT(listxattr(path, list.data(), sizeof(list)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(list, expected);
+
+  // Listxattr should succeed if there are no attributes, even if the buffer
+  // passed in is a nullptr.
+  EXPECT_THAT(listxattr(path, nullptr, sizeof(list)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, ListxattrNullBuffer) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(listxattr(path, nullptr, sizeof(name)),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, ListxattrSizeTooSmall) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  char list[sizeof(name) - 1];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, ListxattrZeroSize) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(listxattr(path, nullptr, 0),
+              SyscallSucceedsWithValue(sizeof(name)));
+}
+
+TEST_F(XattrTest, RemoveXattr) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, RemoveXattrNonexistentName) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENODATA));
+}
 
+TEST_F(XattrTest, LXattrOnSymlink) {
+  const char name[] = "user.test";
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
 
-  EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0),
+  EXPECT_THAT(lsetxattr(link.path().c_str(), name, nullptr, 0, 0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0),
+  EXPECT_THAT(lgetxattr(link.path().c_str(), name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(llistxattr(link.path().c_str(), nullptr, 0),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(lremovexattr(link.path().c_str(), name),
+              SyscallFailsWithErrno(EPERM));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
+TEST_F(XattrTest, LXattrOnNonsymlink) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -558,13 +572,16 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
   EXPECT_THAT(lgetxattr(path, name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
-}
 
-TEST_F(XattrTest, FGetSetxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  char list[sizeof(name)];
+  EXPECT_THAT(llistxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 
+  EXPECT_THAT(lremovexattr(path, name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrWithFD) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
@@ -577,6 +594,13 @@ TEST_F(XattrTest, FGetSetxattr) {
   EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(flistxattr(fd.get(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 9cbf5a3dcc71d85cd857f117d4b7189a101be9c1 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 25 Oct 2019 03:09:59 +0000
Subject: Enable pkg/cpuid support on arm64.

Fixes #1255

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I8614e6f3ee321c2989567e4e712aa8f28cc9db14
---
 pkg/cpuid/BUILD                   |    9 +-
 pkg/cpuid/cpuid.go                | 1098 +-----------------------------------
 pkg/cpuid/cpuid_arm64.go          |  461 ++++++++++++++++
 pkg/cpuid/cpuid_arm64_test.go     |   55 ++
 pkg/cpuid/cpuid_parse_test.go     |  142 -----
 pkg/cpuid/cpuid_parse_x86_test.go |  144 +++++
 pkg/cpuid/cpuid_test.go           |  241 --------
 pkg/cpuid/cpuid_x86.go            | 1100 +++++++++++++++++++++++++++++++++++++
 pkg/cpuid/cpuid_x86_test.go       |  243 ++++++++
 9 files changed, 2018 insertions(+), 1475 deletions(-)
 create mode 100644 pkg/cpuid/cpuid_arm64.go
 create mode 100644 pkg/cpuid/cpuid_arm64_test.go
 delete mode 100644 pkg/cpuid/cpuid_parse_test.go
 create mode 100644 pkg/cpuid/cpuid_parse_x86_test.go
 delete mode 100644 pkg/cpuid/cpuid_test.go
 create mode 100644 pkg/cpuid/cpuid_x86.go
 create mode 100644 pkg/cpuid/cpuid_x86_test.go

(limited to 'pkg')

diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD
index 43a432190..d6cb1a549 100644
--- a/pkg/cpuid/BUILD
+++ b/pkg/cpuid/BUILD
@@ -7,6 +7,8 @@ go_library(
     srcs = [
         "cpu_amd64.s",
         "cpuid.go",
+        "cpuid_arm64.go",
+        "cpuid_x86.go",
     ],
     visibility = ["//:sandbox"],
     deps = ["//pkg/log"],
@@ -15,7 +17,10 @@ go_library(
 go_test(
     name = "cpuid_test",
     size = "small",
-    srcs = ["cpuid_test.go"],
+    srcs = [
+        "cpuid_arm64_test.go",
+        "cpuid_x86_test.go",
+    ],
     library = ":cpuid",
 )
 
@@ -23,7 +28,7 @@ go_test(
     name = "cpuid_parse_test",
     size = "small",
     srcs = [
-        "cpuid_parse_test.go",
+        "cpuid_parse_x86_test.go",
     ],
     library = ":cpuid",
     tags = ["manual"],
diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go
index cf50ee53f..f7f9dbf86 100644
--- a/pkg/cpuid/cpuid.go
+++ b/pkg/cpuid/cpuid.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
-
 // Package cpuid provides basic functionality for creating and adjusting CPU
 // feature sets.
 //
@@ -21,1100 +19,20 @@
 // known platform, or HostFeatureSet()) and then add, remove, and test for
 // features as desired.
 //
-// For example: Test for hardware extended state saving, and if we don't have
-// it, don't expose AVX, which cannot be saved with fxsave.
+// For example: on x86, test for hardware extended state saving, and if
+// we don't have it, don't expose AVX, which cannot be saved with fxsave.
 //
 //   if !HostFeatureSet().HasFeature(X86FeatureXSAVE) {
 //     exposedFeatures.Remove(X86FeatureAVX)
 //   }
 package cpuid
 
-import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/log"
-)
-
-// Common references for CPUID leaves and bits:
-//
-// Intel:
-//   * Intel SDM Volume 2, Chapter 3.2 "CPUID" (more up-to-date)
-//   * Intel Application Note 485 (more detailed)
-//
-// AMD:
-//   * AMD64 APM Volume 3, Appendix 3 "Obtaining Processor Information ..."
-
 // Feature is a unique identifier for a particular cpu feature. We just use an
-// int as a feature number on x86.
+// int as a feature number on x86 and arm64.
 //
-// Features are numbered according to "blocks". Each block is 32 bits, and
-// feature bits from the same source (cpuid leaf/level) are in the same block.
-type Feature int
-
-// block is a collection of 32 Feature bits.
-type block int
-
-const blockSize = 32
-
-// Feature bits are numbered according to "blocks". Each block is 32 bits, and
+// On x86, features are numbered according to "blocks". Each block is 32 bits, and
 // feature bits from the same source (cpuid leaf/level) are in the same block.
-func featureID(b block, bit int) Feature {
-	return Feature(32*int(b) + bit)
-}
-
-// Block 0 constants are all of the "basic" feature bits returned by a cpuid in
-// ecx with eax=1.
-const (
-	X86FeatureSSE3 Feature = iota
-	X86FeaturePCLMULDQ
-	X86FeatureDTES64
-	X86FeatureMONITOR
-	X86FeatureDSCPL
-	X86FeatureVMX
-	X86FeatureSMX
-	X86FeatureEST
-	X86FeatureTM2
-	X86FeatureSSSE3 // Not a typo, "supplemental" SSE3.
-	X86FeatureCNXTID
-	X86FeatureSDBG
-	X86FeatureFMA
-	X86FeatureCX16
-	X86FeatureXTPR
-	X86FeaturePDCM
-	_ // ecx bit 16 is reserved.
-	X86FeaturePCID
-	X86FeatureDCA
-	X86FeatureSSE4_1
-	X86FeatureSSE4_2
-	X86FeatureX2APIC
-	X86FeatureMOVBE
-	X86FeaturePOPCNT
-	X86FeatureTSCD
-	X86FeatureAES
-	X86FeatureXSAVE
-	X86FeatureOSXSAVE
-	X86FeatureAVX
-	X86FeatureF16C
-	X86FeatureRDRAND
-	_ // ecx bit 31 is reserved.
-)
-
-// Block 1 constants are all of the "basic" feature bits returned by a cpuid in
-// edx with eax=1.
-const (
-	X86FeatureFPU Feature = 32 + iota
-	X86FeatureVME
-	X86FeatureDE
-	X86FeaturePSE
-	X86FeatureTSC
-	X86FeatureMSR
-	X86FeaturePAE
-	X86FeatureMCE
-	X86FeatureCX8
-	X86FeatureAPIC
-	_ // edx bit 10 is reserved.
-	X86FeatureSEP
-	X86FeatureMTRR
-	X86FeaturePGE
-	X86FeatureMCA
-	X86FeatureCMOV
-	X86FeaturePAT
-	X86FeaturePSE36
-	X86FeaturePSN
-	X86FeatureCLFSH
-	_ // edx bit 20 is reserved.
-	X86FeatureDS
-	X86FeatureACPI
-	X86FeatureMMX
-	X86FeatureFXSR
-	X86FeatureSSE
-	X86FeatureSSE2
-	X86FeatureSS
-	X86FeatureHTT
-	X86FeatureTM
-	X86FeatureIA64
-	X86FeaturePBE
-)
-
-// Block 2 bits are the "structured extended" features returned in ebx for
-// eax=7, ecx=0.
-const (
-	X86FeatureFSGSBase Feature = 2*32 + iota
-	X86FeatureTSC_ADJUST
-	_ // ebx bit 2 is reserved.
-	X86FeatureBMI1
-	X86FeatureHLE
-	X86FeatureAVX2
-	X86FeatureFDP_EXCPTN_ONLY
-	X86FeatureSMEP
-	X86FeatureBMI2
-	X86FeatureERMS
-	X86FeatureINVPCID
-	X86FeatureRTM
-	X86FeatureCQM
-	X86FeatureFPCSDS
-	X86FeatureMPX
-	X86FeatureRDT
-	X86FeatureAVX512F
-	X86FeatureAVX512DQ
-	X86FeatureRDSEED
-	X86FeatureADX
-	X86FeatureSMAP
-	X86FeatureAVX512IFMA
-	X86FeaturePCOMMIT
-	X86FeatureCLFLUSHOPT
-	X86FeatureCLWB
-	X86FeatureIPT // Intel processor trace.
-	X86FeatureAVX512PF
-	X86FeatureAVX512ER
-	X86FeatureAVX512CD
-	X86FeatureSHA
-	X86FeatureAVX512BW
-	X86FeatureAVX512VL
-)
-
-// Block 3 bits are the "extended" features returned in ecx for eax=7, ecx=0.
-const (
-	X86FeaturePREFETCHWT1 Feature = 3*32 + iota
-	X86FeatureAVX512VBMI
-	X86FeatureUMIP
-	X86FeaturePKU
-	X86FeatureOSPKE
-	X86FeatureWAITPKG
-	X86FeatureAVX512_VBMI2
-	_ // ecx bit 7 is reserved
-	X86FeatureGFNI
-	X86FeatureVAES
-	X86FeatureVPCLMULQDQ
-	X86FeatureAVX512_VNNI
-	X86FeatureAVX512_BITALG
-	X86FeatureTME
-	X86FeatureAVX512_VPOPCNTDQ
-	_ // ecx bit 15 is reserved
-	X86FeatureLA57
-	// ecx bits 17-21 are reserved
-	_
-	_
-	_
-	_
-	_
-	X86FeatureRDPID
-	// ecx bits 23-24 are reserved
-	_
-	_
-	X86FeatureCLDEMOTE
-	_ // ecx bit 26 is reserved
-	X86FeatureMOVDIRI
-	X86FeatureMOVDIR64B
-)
-
-// Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX.
-// The CPUID leaf is available only if 'X86FeatureXSAVE' is present.
-const (
-	X86FeatureXSAVEOPT Feature = 4*32 + iota
-	X86FeatureXSAVEC
-	X86FeatureXGETBV1
-	X86FeatureXSAVES
-	// EAX[31:4] are reserved.
-)
-
-// Block 5 constants are the extended feature bits in
-// CPUID.(EAX=0x80000001):ECX.
-const (
-	X86FeatureLAHF64 Feature = 5*32 + iota
-	X86FeatureCMP_LEGACY
-	X86FeatureSVM
-	X86FeatureEXTAPIC
-	X86FeatureCR8_LEGACY
-	X86FeatureLZCNT
-	X86FeatureSSE4A
-	X86FeatureMISALIGNSSE
-	X86FeaturePREFETCHW
-	X86FeatureOSVW
-	X86FeatureIBS
-	X86FeatureXOP
-	X86FeatureSKINIT
-	X86FeatureWDT
-	_ // ecx bit 14 is reserved.
-	X86FeatureLWP
-	X86FeatureFMA4
-	X86FeatureTCE
-	_ // ecx bit 18 is reserved.
-	_ // ecx bit 19 is reserved.
-	_ // ecx bit 20 is reserved.
-	X86FeatureTBM
-	X86FeatureTOPOLOGY
-	X86FeaturePERFCTR_CORE
-	X86FeaturePERFCTR_NB
-	_ // ecx bit 25 is reserved.
-	X86FeatureBPEXT
-	X86FeaturePERFCTR_TSC
-	X86FeaturePERFCTR_LLC
-	X86FeatureMWAITX
-	// ECX[31:30] are reserved.
-)
-
-// Block 6 constants are the extended feature bits in
-// CPUID.(EAX=0x80000001):EDX.
 //
-// These are sparse, and so the bit positions are assigned manually.
-const (
-	// On AMD, EDX[24:23] | EDX[17:12] | EDX[9:0] are duplicate features
-	// also defined in block 1 (in identical bit positions). Those features
-	// are not listed here.
-	block6DuplicateMask = 0x183f3ff
-
-	X86FeatureSYSCALL  Feature = 6*32 + 11
-	X86FeatureNX       Feature = 6*32 + 20
-	X86FeatureMMXEXT   Feature = 6*32 + 22
-	X86FeatureFXSR_OPT Feature = 6*32 + 25
-	X86FeatureGBPAGES  Feature = 6*32 + 26
-	X86FeatureRDTSCP   Feature = 6*32 + 27
-	X86FeatureLM       Feature = 6*32 + 29
-	X86Feature3DNOWEXT Feature = 6*32 + 30
-	X86Feature3DNOW    Feature = 6*32 + 31
-)
-
-// linuxBlockOrder defines the order in which linux organizes the feature
-// blocks. Linux also tracks feature bits in 32-bit blocks, but in an order
-// which doesn't match well here, so for the /proc/cpuinfo generation we simply
-// re-map the blocks to Linux's ordering and then go through the bits in each
-// block.
-var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4, 3}
-
-// To make emulation of /proc/cpuinfo easy, these names match the names of the
-// basic features in Linux defined in arch/x86/kernel/cpu/capflags.c.
-var x86FeatureStrings = map[Feature]string{
-	// Block 0.
-	X86FeatureSSE3:     "pni",
-	X86FeaturePCLMULDQ: "pclmulqdq",
-	X86FeatureDTES64:   "dtes64",
-	X86FeatureMONITOR:  "monitor",
-	X86FeatureDSCPL:    "ds_cpl",
-	X86FeatureVMX:      "vmx",
-	X86FeatureSMX:      "smx",
-	X86FeatureEST:      "est",
-	X86FeatureTM2:      "tm2",
-	X86FeatureSSSE3:    "ssse3",
-	X86FeatureCNXTID:   "cid",
-	X86FeatureSDBG:     "sdbg",
-	X86FeatureFMA:      "fma",
-	X86FeatureCX16:     "cx16",
-	X86FeatureXTPR:     "xtpr",
-	X86FeaturePDCM:     "pdcm",
-	X86FeaturePCID:     "pcid",
-	X86FeatureDCA:      "dca",
-	X86FeatureSSE4_1:   "sse4_1",
-	X86FeatureSSE4_2:   "sse4_2",
-	X86FeatureX2APIC:   "x2apic",
-	X86FeatureMOVBE:    "movbe",
-	X86FeaturePOPCNT:   "popcnt",
-	X86FeatureTSCD:     "tsc_deadline_timer",
-	X86FeatureAES:      "aes",
-	X86FeatureXSAVE:    "xsave",
-	X86FeatureAVX:      "avx",
-	X86FeatureF16C:     "f16c",
-	X86FeatureRDRAND:   "rdrand",
-
-	// Block 1.
-	X86FeatureFPU:   "fpu",
-	X86FeatureVME:   "vme",
-	X86FeatureDE:    "de",
-	X86FeaturePSE:   "pse",
-	X86FeatureTSC:   "tsc",
-	X86FeatureMSR:   "msr",
-	X86FeaturePAE:   "pae",
-	X86FeatureMCE:   "mce",
-	X86FeatureCX8:   "cx8",
-	X86FeatureAPIC:  "apic",
-	X86FeatureSEP:   "sep",
-	X86FeatureMTRR:  "mtrr",
-	X86FeaturePGE:   "pge",
-	X86FeatureMCA:   "mca",
-	X86FeatureCMOV:  "cmov",
-	X86FeaturePAT:   "pat",
-	X86FeaturePSE36: "pse36",
-	X86FeaturePSN:   "pn",
-	X86FeatureCLFSH: "clflush",
-	X86FeatureDS:    "dts",
-	X86FeatureACPI:  "acpi",
-	X86FeatureMMX:   "mmx",
-	X86FeatureFXSR:  "fxsr",
-	X86FeatureSSE:   "sse",
-	X86FeatureSSE2:  "sse2",
-	X86FeatureSS:    "ss",
-	X86FeatureHTT:   "ht",
-	X86FeatureTM:    "tm",
-	X86FeatureIA64:  "ia64",
-	X86FeaturePBE:   "pbe",
-
-	// Block 2.
-	X86FeatureFSGSBase:   "fsgsbase",
-	X86FeatureTSC_ADJUST: "tsc_adjust",
-	X86FeatureBMI1:       "bmi1",
-	X86FeatureHLE:        "hle",
-	X86FeatureAVX2:       "avx2",
-	X86FeatureSMEP:       "smep",
-	X86FeatureBMI2:       "bmi2",
-	X86FeatureERMS:       "erms",
-	X86FeatureINVPCID:    "invpcid",
-	X86FeatureRTM:        "rtm",
-	X86FeatureCQM:        "cqm",
-	X86FeatureMPX:        "mpx",
-	X86FeatureRDT:        "rdt_a",
-	X86FeatureAVX512F:    "avx512f",
-	X86FeatureAVX512DQ:   "avx512dq",
-	X86FeatureRDSEED:     "rdseed",
-	X86FeatureADX:        "adx",
-	X86FeatureSMAP:       "smap",
-	X86FeatureCLWB:       "clwb",
-	X86FeatureAVX512PF:   "avx512pf",
-	X86FeatureAVX512ER:   "avx512er",
-	X86FeatureAVX512CD:   "avx512cd",
-	X86FeatureSHA:        "sha_ni",
-	X86FeatureAVX512BW:   "avx512bw",
-	X86FeatureAVX512VL:   "avx512vl",
-
-	// Block 3.
-	X86FeatureAVX512VBMI:       "avx512vbmi",
-	X86FeatureUMIP:             "umip",
-	X86FeaturePKU:              "pku",
-	X86FeatureOSPKE:            "ospke",
-	X86FeatureWAITPKG:          "waitpkg",
-	X86FeatureAVX512_VBMI2:     "avx512_vbmi2",
-	X86FeatureGFNI:             "gfni",
-	X86FeatureVAES:             "vaes",
-	X86FeatureVPCLMULQDQ:       "vpclmulqdq",
-	X86FeatureAVX512_VNNI:      "avx512_vnni",
-	X86FeatureAVX512_BITALG:    "avx512_bitalg",
-	X86FeatureTME:              "tme",
-	X86FeatureAVX512_VPOPCNTDQ: "avx512_vpopcntdq",
-	X86FeatureLA57:             "la57",
-	X86FeatureRDPID:            "rdpid",
-	X86FeatureCLDEMOTE:         "cldemote",
-	X86FeatureMOVDIRI:          "movdiri",
-	X86FeatureMOVDIR64B:        "movdir64b",
-
-	// Block 4.
-	X86FeatureXSAVEOPT: "xsaveopt",
-	X86FeatureXSAVEC:   "xsavec",
-	X86FeatureXGETBV1:  "xgetbv1",
-	X86FeatureXSAVES:   "xsaves",
-
-	// Block 5.
-	X86FeatureLAHF64:       "lahf_lm", // LAHF/SAHF in long mode
-	X86FeatureCMP_LEGACY:   "cmp_legacy",
-	X86FeatureSVM:          "svm",
-	X86FeatureEXTAPIC:      "extapic",
-	X86FeatureCR8_LEGACY:   "cr8_legacy",
-	X86FeatureLZCNT:        "abm", // Advanced bit manipulation
-	X86FeatureSSE4A:        "sse4a",
-	X86FeatureMISALIGNSSE:  "misalignsse",
-	X86FeaturePREFETCHW:    "3dnowprefetch",
-	X86FeatureOSVW:         "osvw",
-	X86FeatureIBS:          "ibs",
-	X86FeatureXOP:          "xop",
-	X86FeatureSKINIT:       "skinit",
-	X86FeatureWDT:          "wdt",
-	X86FeatureLWP:          "lwp",
-	X86FeatureFMA4:         "fma4",
-	X86FeatureTCE:          "tce",
-	X86FeatureTBM:          "tbm",
-	X86FeatureTOPOLOGY:     "topoext",
-	X86FeaturePERFCTR_CORE: "perfctr_core",
-	X86FeaturePERFCTR_NB:   "perfctr_nb",
-	X86FeatureBPEXT:        "bpext",
-	X86FeaturePERFCTR_TSC:  "ptsc",
-	X86FeaturePERFCTR_LLC:  "perfctr_llc",
-	X86FeatureMWAITX:       "mwaitx",
-
-	// Block 6.
-	X86FeatureSYSCALL:  "syscall",
-	X86FeatureNX:       "nx",
-	X86FeatureMMXEXT:   "mmxext",
-	X86FeatureFXSR_OPT: "fxsr_opt",
-	X86FeatureGBPAGES:  "pdpe1gb",
-	X86FeatureRDTSCP:   "rdtscp",
-	X86FeatureLM:       "lm",
-	X86Feature3DNOWEXT: "3dnowext",
-	X86Feature3DNOW:    "3dnow",
-}
-
-// These flags are parse only---they can be used for setting / unsetting the
-// flags, but will not get printed out in /proc/cpuinfo.
-var x86FeatureParseOnlyStrings = map[Feature]string{
-	// Block 0.
-	X86FeatureOSXSAVE: "osxsave",
-
-	// Block 2.
-	X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only",
-	X86FeatureFPCSDS:          "fpcsds",
-	X86FeatureIPT:             "pt",
-	X86FeatureCLFLUSHOPT:      "clfushopt",
-
-	// Block 3.
-	X86FeaturePREFETCHWT1: "prefetchwt1",
-}
-
-// intelCacheDescriptors describe the caches and TLBs on the system. They are
-// returned in the registers for eax=2. Intel only.
-type intelCacheDescriptor uint8
-
-// Valid cache/TLB descriptors. All descriptors can be found in Intel SDM Vol.
-// 2, Ch. 3.2, "CPUID", Table 3-12 "Encoding of CPUID Leaf 2 Descriptors".
-const (
-	intelNullDescriptor    intelCacheDescriptor = 0
-	intelNoTLBDescriptor   intelCacheDescriptor = 0xfe
-	intelNoCacheDescriptor intelCacheDescriptor = 0xff
-
-	// Most descriptors omitted for brevity as they are currently unused.
-)
-
-// CacheType describes the type of a cache, as returned in eax[4:0] for eax=4.
-type CacheType uint8
-
-const (
-	// cacheNull indicates that there are no more entries.
-	cacheNull CacheType = iota
-
-	// CacheData is a data cache.
-	CacheData
-
-	// CacheInstruction is an instruction cache.
-	CacheInstruction
-
-	// CacheUnified is a unified instruction and data cache.
-	CacheUnified
-)
-
-// Cache describes the parameters of a single cache on the system.
-//
-// +stateify savable
-type Cache struct {
-	// Level is the hierarchical level of this cache (L1, L2, etc).
-	Level uint32
-
-	// Type is the type of cache.
-	Type CacheType
-
-	// FullyAssociative indicates that entries may be placed in any block.
-	FullyAssociative bool
-
-	// Partitions is the number of physical partitions in the cache.
-	Partitions uint32
-
-	// Ways is the number of ways of associativity in the cache.
-	Ways uint32
-
-	// Sets is the number of sets in the cache.
-	Sets uint32
-
-	// InvalidateHierarchical indicates that WBINVD/INVD from threads
-	// sharing this cache acts upon lower level caches for threads sharing
-	// this cache.
-	InvalidateHierarchical bool
-
-	// Inclusive indicates that this cache is inclusive of lower cache
-	// levels.
-	Inclusive bool
-
-	// DirectMapped indicates that this cache is directly mapped from
-	// address, rather than using a hash function.
-	DirectMapped bool
-}
-
-// Just a way to wrap cpuid function numbers.
-type cpuidFunction uint32
-
-// The constants below are the lower or "standard" cpuid functions, ordered as
-// defined by the hardware.
-const (
-	vendorID                      cpuidFunction = iota // Returns vendor ID and largest standard function.
-	featureInfo                                        // Returns basic feature bits and processor signature.
-	intelCacheDescriptors                              // Returns list of cache descriptors. Intel only.
-	intelSerialNumber                                  // Returns processor serial number (obsolete on new hardware). Intel only.
-	intelDeterministicCacheParams                      // Returns deterministic cache information. Intel only.
-	monitorMwaitParams                                 // Returns information about monitor/mwait instructions.
-	powerParams                                        // Returns information about power management and thermal sensors.
-	extendedFeatureInfo                                // Returns extended feature bits.
-	_                                                  // Function 0x8 is reserved.
-	intelDCAParams                                     // Returns direct cache access information. Intel only.
-	intelPMCInfo                                       // Returns information about performance monitoring features. Intel only.
-	intelX2APICInfo                                    // Returns core/logical processor topology. Intel only.
-	_                                                  // Function 0xc is reserved.
-	xSaveInfo                                          // Returns information about extended state management.
-)
-
-// The "extended" functions start at 0x80000000.
-const (
-	extendedFunctionInfo cpuidFunction = 0x80000000 + iota // Returns highest available extended function in eax.
-	extendedFeatures                                       // Returns some extended feature bits in edx and ecx.
-)
-
-// These are the extended floating point state features. They are used to
-// enumerate floating point features in XCR0, XSTATE_BV, etc.
-const (
-	XSAVEFeatureX87         = 1 << 0
-	XSAVEFeatureSSE         = 1 << 1
-	XSAVEFeatureAVX         = 1 << 2
-	XSAVEFeatureBNDREGS     = 1 << 3
-	XSAVEFeatureBNDCSR      = 1 << 4
-	XSAVEFeatureAVX512op    = 1 << 5
-	XSAVEFeatureAVX512zmm0  = 1 << 6
-	XSAVEFeatureAVX512zmm16 = 1 << 7
-	XSAVEFeaturePKRU        = 1 << 9
-)
-
-var cpuFreqMHz float64
-
-// x86FeaturesFromString includes features from x86FeatureStrings and
-// x86FeatureParseOnlyStrings.
-var x86FeaturesFromString = make(map[string]Feature)
-
-// FeatureFromString returns the Feature associated with the given feature
-// string plus a bool to indicate if it could find the feature.
-func FeatureFromString(s string) (Feature, bool) {
-	f, b := x86FeaturesFromString[s]
-	return f, b
-}
-
-// String implements fmt.Stringer.
-func (f Feature) String() string {
-	if s := f.flagString(false); s != "" {
-		return s
-	}
-
-	block := int(f) / 32
-	bit := int(f) % 32
-	return fmt.Sprintf("<cpuflag %d; block %d bit %d>", f, block, bit)
-}
-
-func (f Feature) flagString(cpuinfoOnly bool) string {
-	if s, ok := x86FeatureStrings[f]; ok {
-		return s
-	}
-	if !cpuinfoOnly {
-		return x86FeatureParseOnlyStrings[f]
-	}
-	return ""
-}
-
-// FeatureSet is a set of Features for a CPU.
-//
-// +stateify savable
-type FeatureSet struct {
-	// Set is the set of features that are enabled in this FeatureSet.
-	Set map[Feature]bool
-
-	// VendorID is the 12-char string returned in ebx:edx:ecx for eax=0.
-	VendorID string
-
-	// ExtendedFamily is part of the processor signature.
-	ExtendedFamily uint8
-
-	// ExtendedModel is part of the processor signature.
-	ExtendedModel uint8
-
-	// ProcessorType is part of the processor signature.
-	ProcessorType uint8
-
-	// Family is part of the processor signature.
-	Family uint8
-
-	// Model is part of the processor signature.
-	Model uint8
-
-	// SteppingID is part of the processor signature.
-	SteppingID uint8
-
-	// Caches describes the caches on the CPU.
-	Caches []Cache
-
-	// CacheLine is the size of a cache line in bytes.
-	//
-	// All caches use the same line size. This is not enforced in the CPUID
-	// encoding, but is true on all known x86 processors.
-	CacheLine uint32
-}
-
-// FlagsString prints out supported CPU flags. If cpuinfoOnly is true, it is
-// equivalent to the "flags" field in /proc/cpuinfo.
-func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string {
-	var s []string
-	for _, b := range linuxBlockOrder {
-		for i := 0; i < blockSize; i++ {
-			if f := featureID(b, i); fs.Set[f] {
-				if fstr := f.flagString(cpuinfoOnly); fstr != "" {
-					s = append(s, fstr)
-				}
-			}
-		}
-	}
-	return strings.Join(s, " ")
-}
-
-// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is
-// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are
-// not always printed in Linux. The bogomips field is simply made up.
-func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
-	fmt.Fprintf(b, "processor\t: %d\n", cpu)
-	fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID)
-	fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family)
-	fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model)
-	fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now.
-	fmt.Fprintf(b, "stepping\t: %s\n", "unknown")   // Unknown for now.
-	fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz)
-	fmt.Fprintln(b, "fpu\t\t: yes")
-	fmt.Fprintln(b, "fpu_exception\t: yes")
-	fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID.
-	fmt.Fprintln(b, "wp\t\t: yes")
-	fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true))
-	fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
-	fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine)
-	fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine)
-	fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48)
-	fmt.Fprintln(b, "power management:") // This is always here, but can be blank.
-	fmt.Fprintln(b, "")                  // The /proc/cpuinfo file ends with an extra newline.
-}
-
-const (
-	amdVendorID   = "AuthenticAMD"
-	intelVendorID = "GenuineIntel"
-)
-
-// AMD returns true if fs describes an AMD CPU.
-func (fs *FeatureSet) AMD() bool {
-	return fs.VendorID == amdVendorID
-}
-
-// Intel returns true if fs describes an Intel CPU.
-func (fs *FeatureSet) Intel() bool {
-	return fs.VendorID == intelVendorID
-}
-
-// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a
-// subset of the host feature set.
-type ErrIncompatible struct {
-	message string
-}
-
-// Error implements error.
-func (e ErrIncompatible) Error() string {
-	return e.message
-}
-
-// CheckHostCompatible returns nil if fs is a subset of the host feature set.
-func (fs *FeatureSet) CheckHostCompatible() error {
-	hfs := HostFeatureSet()
-
-	if diff := fs.Subtract(hfs); diff != nil {
-		return ErrIncompatible{fmt.Sprintf("CPU feature set %v incompatible with host feature set %v (missing: %v)", fs.FlagsString(false), hfs.FlagsString(false), diff)}
-	}
-
-	// The size of a cache line must match, as it is critical to correctly
-	// utilizing CLFLUSH. Other cache properties are allowed to change, as
-	// they are not important to correctness.
-	if fs.CacheLine != hfs.CacheLine {
-		return ErrIncompatible{fmt.Sprintf("CPU cache line size %d incompatible with host cache line size %d", fs.CacheLine, hfs.CacheLine)}
-	}
-
-	return nil
-}
-
-// Helper to convert 3 regs into 12-byte vendor ID.
-func vendorIDFromRegs(bx, cx, dx uint32) string {
-	bytes := make([]byte, 0, 12)
-	for i := uint(0); i < 4; i++ {
-		b := byte(bx >> (i * 8))
-		bytes = append(bytes, b)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		b := byte(dx >> (i * 8))
-		bytes = append(bytes, b)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		b := byte(cx >> (i * 8))
-		bytes = append(bytes, b)
-	}
-	return string(bytes)
-}
-
-// ExtendedStateSize returns the number of bytes needed to save the "extended
-// state" for this processor and the boundary it must be aligned to. Extended
-// state includes floating point registers, and other cpu state that's not
-// associated with the normal task context.
-//
-// Note: We can save some space here with an optimization where we use a
-// smaller chunk of memory depending on features that are actually enabled.
-// Currently we just use the largest possible size for simplicity (which is
-// about 2.5K worst case, with avx512).
-func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
-	if fs.UseXsave() {
-		// Leaf 0 of xsaveinfo function returns the size for currently
-		// enabled xsave features in ebx, the maximum size if all valid
-		// features are saved with xsave in ecx, and valid XCR0 bits in
-		// edx:eax.
-		_, _, maxSize, _ := HostID(uint32(xSaveInfo), 0)
-		return uint(maxSize), 64
-	}
-
-	// If we don't support xsave, we fall back to fxsave, which requires
-	// 512 bytes aligned to 16 bytes.
-	return 512, 16
-}
-
-// ValidXCR0Mask returns the bits that may be set to 1 in control register
-// XCR0.
-func (fs *FeatureSet) ValidXCR0Mask() uint64 {
-	if !fs.UseXsave() {
-		return 0
-	}
-	eax, _, _, edx := HostID(uint32(xSaveInfo), 0)
-	return uint64(edx)<<32 | uint64(eax)
-}
-
-// vendorIDRegs returns the 3 register values used to construct the 12-byte
-// vendor ID string for eax=0.
-func (fs *FeatureSet) vendorIDRegs() (bx, dx, cx uint32) {
-	for i := uint(0); i < 4; i++ {
-		bx |= uint32(fs.VendorID[i]) << (i * 8)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		dx |= uint32(fs.VendorID[i+4]) << (i * 8)
-	}
-
-	for i := uint(0); i < 4; i++ {
-		cx |= uint32(fs.VendorID[i+8]) << (i * 8)
-	}
-	return
-}
-
-// signature returns the signature dword that's returned in eax when eax=1.
-func (fs *FeatureSet) signature() uint32 {
-	var s uint32
-	s |= uint32(fs.SteppingID & 0xf)
-	s |= uint32(fs.Model&0xf) << 4
-	s |= uint32(fs.Family&0xf) << 8
-	s |= uint32(fs.ProcessorType&0x3) << 12
-	s |= uint32(fs.ExtendedModel&0xf) << 16
-	s |= uint32(fs.ExtendedFamily&0xff) << 20
-	return s
-}
-
-// Helper to deconstruct signature dword.
-func signatureSplit(v uint32) (ef, em, pt, f, m, sid uint8) {
-	sid = uint8(v & 0xf)
-	m = uint8(v>>4) & 0xf
-	f = uint8(v>>8) & 0xf
-	pt = uint8(v>>12) & 0x3
-	em = uint8(v>>16) & 0xf
-	ef = uint8(v >> 20)
-	return
-}
-
-// Helper to convert blockwise feature bit masks into a set of features. Masks
-// must be provided in order for each block, without skipping them. If a block
-// does not matter for this feature set, 0 is specified.
-func setFromBlockMasks(blocks ...uint32) map[Feature]bool {
-	s := make(map[Feature]bool)
-	for b, blockMask := range blocks {
-		for i := 0; i < blockSize; i++ {
-			if blockMask&1 != 0 {
-				s[featureID(block(b), i)] = true
-			}
-			blockMask >>= 1
-		}
-	}
-	return s
-}
-
-// blockMask returns the 32-bit mask associated with a block of features.
-func (fs *FeatureSet) blockMask(b block) uint32 {
-	var mask uint32
-	for i := 0; i < blockSize; i++ {
-		if fs.Set[featureID(b, i)] {
-			mask |= 1 << uint(i)
-		}
-	}
-	return mask
-}
-
-// Remove removes a Feature from a FeatureSet. It ignores features
-// that are not in the FeatureSet.
-func (fs *FeatureSet) Remove(feature Feature) {
-	delete(fs.Set, feature)
-}
-
-// Add adds a Feature to a FeatureSet. It ignores duplicate features.
-func (fs *FeatureSet) Add(feature Feature) {
-	fs.Set[feature] = true
-}
-
-// HasFeature tests whether or not a feature is in the given feature set.
-func (fs *FeatureSet) HasFeature(feature Feature) bool {
-	return fs.Set[feature]
-}
-
-// Subtract returns the features present in fs that are not present in other.
-// If all features in fs are present in other, Subtract returns nil.
-func (fs *FeatureSet) Subtract(other *FeatureSet) (diff map[Feature]bool) {
-	for f := range fs.Set {
-		if !other.Set[f] {
-			if diff == nil {
-				diff = make(map[Feature]bool)
-			}
-			diff[f] = true
-		}
-	}
-
-	return
-}
-
-// EmulateID emulates a cpuid instruction based on the feature set.
-func (fs *FeatureSet) EmulateID(origAx, origCx uint32) (ax, bx, cx, dx uint32) {
-	switch cpuidFunction(origAx) {
-	case vendorID:
-		ax = uint32(xSaveInfo) // 0xd (xSaveInfo) is the highest function we support.
-		bx, dx, cx = fs.vendorIDRegs()
-	case featureInfo:
-		// CLFLUSH line size is encoded in quadwords. Other fields in bx unsupported.
-		bx = (fs.CacheLine / 8) << 8
-		cx = fs.blockMask(block(0))
-		dx = fs.blockMask(block(1))
-		ax = fs.signature()
-	case intelCacheDescriptors:
-		if !fs.Intel() {
-			// Reserved on non-Intel.
-			return 0, 0, 0, 0
-		}
-
-		// "The least-significant byte in register EAX (register AL)
-		// will always return 01H. Software should ignore this value
-		// and not interpret it as an informational descriptor." - SDM
-		//
-		// We only support reporting cache parameters via
-		// intelDeterministicCacheParams; report as much here.
-		//
-		// We do not support exposing TLB information at all.
-		ax = 1 | (uint32(intelNoCacheDescriptor) << 8)
-	case intelDeterministicCacheParams:
-		if !fs.Intel() {
-			// Reserved on non-Intel.
-			return 0, 0, 0, 0
-		}
-
-		// cx is the index of the cache to describe.
-		if int(origCx) >= len(fs.Caches) {
-			return uint32(cacheNull), 0, 0, 0
-		}
-		c := fs.Caches[origCx]
-
-		ax = uint32(c.Type)
-		ax |= c.Level << 5
-		ax |= 1 << 8 // Always claim the cache is "self-initializing".
-		if c.FullyAssociative {
-			ax |= 1 << 9
-		}
-		// Processor topology not supported.
-
-		bx = fs.CacheLine - 1
-		bx |= (c.Partitions - 1) << 12
-		bx |= (c.Ways - 1) << 22
-
-		cx = c.Sets - 1
-
-		if !c.InvalidateHierarchical {
-			dx |= 1
-		}
-		if c.Inclusive {
-			dx |= 1 << 1
-		}
-		if !c.DirectMapped {
-			dx |= 1 << 2
-		}
-	case xSaveInfo:
-		if !fs.UseXsave() {
-			return 0, 0, 0, 0
-		}
-		return HostID(uint32(xSaveInfo), origCx)
-	case extendedFeatureInfo:
-		if origCx != 0 {
-			break // Only leaf 0 is supported.
-		}
-		bx = fs.blockMask(block(2))
-		cx = fs.blockMask(block(3))
-	case extendedFunctionInfo:
-		// We only support showing the extended features.
-		ax = uint32(extendedFeatures)
-		cx = 0
-	case extendedFeatures:
-		cx = fs.blockMask(block(5))
-		dx = fs.blockMask(block(6))
-		if fs.AMD() {
-			// AMD duplicates some block 1 features in block 6.
-			dx |= fs.blockMask(block(1)) & block6DuplicateMask
-		}
-	}
-
-	return
-}
-
-// UseXsave returns the choice of fp state saving instruction.
-func (fs *FeatureSet) UseXsave() bool {
-	return fs.HasFeature(X86FeatureXSAVE) && fs.HasFeature(X86FeatureOSXSAVE)
-}
-
-// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction.
-func (fs *FeatureSet) UseXsaveopt() bool {
-	return fs.UseXsave() && fs.HasFeature(X86FeatureXSAVEOPT)
-}
-
-// HostID executes a native CPUID instruction.
-func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32)
-
-// HostFeatureSet uses cpuid to get host values and construct a feature set
-// that matches that of the host machine. Note that there are several places
-// where there appear to be some unnecessary assignments between register names
-// (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show
-// where the different feature blocks come from, to make the code easier to
-// inspect and read.
-func HostFeatureSet() *FeatureSet {
-	// eax=0 gets max supported feature and vendor ID.
-	_, bx, cx, dx := HostID(0, 0)
-	vendorID := vendorIDFromRegs(bx, cx, dx)
-
-	// eax=1 gets basic features in ecx:edx.
-	ax, bx, cx, dx := HostID(1, 0)
-	featureBlock0 := cx
-	featureBlock1 := dx
-	ef, em, pt, f, m, sid := signatureSplit(ax)
-	cacheLine := 8 * (bx >> 8) & 0xff
-
-	// eax=4, ecx=i gets details about cache index i. Only supported on Intel.
-	var caches []Cache
-	if vendorID == intelVendorID {
-		// ecx selects the cache index until a null type is returned.
-		for i := uint32(0); ; i++ {
-			ax, bx, cx, dx := HostID(4, i)
-			t := CacheType(ax & 0xf)
-			if t == cacheNull {
-				break
-			}
-
-			lineSize := (bx & 0xfff) + 1
-			if lineSize != cacheLine {
-				panic(fmt.Sprintf("Mismatched cache line size: %d vs %d", lineSize, cacheLine))
-			}
-
-			caches = append(caches, Cache{
-				Type:                   t,
-				Level:                  (ax >> 5) & 0x7,
-				FullyAssociative:       ((ax >> 9) & 1) == 1,
-				Partitions:             ((bx >> 12) & 0x3ff) + 1,
-				Ways:                   ((bx >> 22) & 0x3ff) + 1,
-				Sets:                   cx + 1,
-				InvalidateHierarchical: (dx & 1) == 0,
-				Inclusive:              ((dx >> 1) & 1) == 1,
-				DirectMapped:           ((dx >> 2) & 1) == 0,
-			})
-		}
-	}
-
-	// eax=7, ecx=0 gets extended features in ecx:ebx.
-	_, bx, cx, _ = HostID(7, 0)
-	featureBlock2 := bx
-	featureBlock3 := cx
-
-	// Leaf 0xd is supported only if CPUID.1:ECX.XSAVE[bit 26] is set.
-	var featureBlock4 uint32
-	if (featureBlock0 & (1 << 26)) != 0 {
-		featureBlock4, _, _, _ = HostID(uint32(xSaveInfo), 1)
-	}
-
-	// eax=0x80000000 gets supported extended levels. We use this to
-	// determine if there are any non-zero block 4 or block 6 bits to find.
-	var featureBlock5, featureBlock6 uint32
-	if ax, _, _, _ := HostID(uint32(extendedFunctionInfo), 0); ax >= uint32(extendedFeatures) {
-		// eax=0x80000001 gets AMD added feature bits.
-		_, _, cx, dx = HostID(uint32(extendedFeatures), 0)
-		featureBlock5 = cx
-		// Ignore features duplicated from block 1 on AMD. These bits
-		// are reserved on Intel.
-		featureBlock6 = dx &^ block6DuplicateMask
-	}
-
-	set := setFromBlockMasks(featureBlock0, featureBlock1, featureBlock2, featureBlock3, featureBlock4, featureBlock5, featureBlock6)
-	return &FeatureSet{
-		Set:            set,
-		VendorID:       vendorID,
-		ExtendedFamily: ef,
-		ExtendedModel:  em,
-		ProcessorType:  pt,
-		Family:         f,
-		Model:          m,
-		SteppingID:     sid,
-		CacheLine:      cacheLine,
-		Caches:         caches,
-	}
-}
-
-// Reads max cpu frequency from host /proc/cpuinfo. Must run before
-// whitelisting. This value is used to create the fake /proc/cpuinfo from a
-// FeatureSet.
-func initCPUFreq() {
-	cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo")
-	if err != nil {
-		// Leave it as 0... The standalone VDSO bails out in the same
-		// way.
-		log.Warningf("Could not read /proc/cpuinfo: %v", err)
-		return
-	}
-	cpuinfo := string(cpuinfob)
-
-	// We get the value straight from host /proc/cpuinfo. On machines with
-	// frequency scaling enabled, this will only get the current value
-	// which will likely be inaccurate. This is fine on machines with
-	// frequency scaling disabled.
-	for _, line := range strings.Split(cpuinfo, "\n") {
-		if strings.Contains(line, "cpu MHz") {
-			splitMHz := strings.Split(line, ":")
-			if len(splitMHz) < 2 {
-				log.Warningf("Could not read /proc/cpuinfo: malformed cpu MHz line")
-				return
-			}
-
-			// If there was a problem, leave cpuFreqMHz as 0.
-			var err error
-			cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64)
-			if err != nil {
-				log.Warningf("Could not parse cpu MHz value %v: %v", splitMHz[1], err)
-				cpuFreqMHz = 0
-				return
-			}
-			return
-		}
-	}
-	log.Warningf("Could not parse /proc/cpuinfo, it is empty or does not contain cpu MHz")
-}
-
-func initFeaturesFromString() {
-	for f, s := range x86FeatureStrings {
-		x86FeaturesFromString[s] = f
-	}
-	for f, s := range x86FeatureParseOnlyStrings {
-		x86FeaturesFromString[s] = f
-	}
-}
-
-func init() {
-	// initCpuFreq must be run before whitelists are enabled.
-	initCPUFreq()
-	initFeaturesFromString()
-}
+// On arm64, features are numbered according to the ELF HWCAP definition.
+// arch/arm64/include/uapi/asm/hwcap.h
+type Feature int
diff --git a/pkg/cpuid/cpuid_arm64.go b/pkg/cpuid/cpuid_arm64.go
new file mode 100644
index 000000000..6d71290c9
--- /dev/null
+++ b/pkg/cpuid/cpuid_arm64.go
@@ -0,0 +1,461 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package cpuid
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io/ioutil"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// ARM64 doesn't have a 'cpuid' equivalent, which means it have no architected
+// discovery mechanism for hardware features available to userspace code at EL0.
+// The kernel exposes the presence of these features to userspace through a set
+// of flags(HWCAP/HWCAP2) bits, exposed in the auxilliary vector.
+// Ref Documentation/arm64/elf_hwcaps.rst for more info.
+//
+// Currently, only the HWCAP bits are supported.
+
+const (
+	// Single and double precision float point types.
+	ARM64FeatureFP Feature = iota
+
+	// Advanced SIMD with single and double precision
+	// float point arithmetic.
+	ARM64FeatureASIMD
+
+	// The generic timer is configured to generate
+	// events at a frequency of approximately 100KHz.
+	ARM64FeatureEVTSTRM
+
+	// AES instructions(AESE/AESD/AESMC/AESIMC).
+	ARM64FeatureAES
+
+	// AES instructions(PMULL/PMULL2).
+	ARM64FeaturePMULL
+
+	// SHA1 instructions(SHA1C/SHA1P/SHA1M etc).
+	ARM64FeatureSHA1
+
+	// SHA2 instructions(SHA256H/SHA256H2/SHA256SU0 etc).
+	ARM64FeatureSHA2
+
+	// CRC32 instructions(CRC32B/CRC32H/CRC32W etc).
+	ARM64FeatureCRC32
+
+	// Atomic instructions(LDADD/LDCLR/LDEOR/LDSET etc).
+	ARM64FeatureATOMICS
+
+	// Half precision float point arithmetic.
+	ARM64FeatureFPHP
+
+	// ASIMD with half precision float point arithmetic.
+	ARM64FeatureASIMDHP
+
+	// EL0 access to certain ID registers is available.
+	ARM64FeatureCPUID
+
+	// SQRDMLAH and SQRDMLSH instructions implemented.
+	ARM64FeatureASIMDRDM
+
+	// The FJCVTZS instruction is implemented.
+	ARM64FeatureJSCVT
+
+	// The FCMLA and FCADD instructions are implemented.
+	ARM64FeatureFCMA
+
+	// The LDAPRB/LDAPRH/LDAPR instructions are implemented.
+	ARM64FeatureLRCPC
+
+	// DC instruction(DC CVAP) supported.
+	ARM64FeatureDCPOP
+
+	// SHA3 instructions(EOR3/RAX1/XAR/BCAX) implemented.
+	ARM64FeatureSHA3
+
+	// SM3 instructions(SM3SS1/SM3TT1A/SM3TT1B) implemented.
+	ARM64FeatureSM3
+
+	// SM4 instructions(SM4E/SM4EKEY) implemented.
+	ARM64FeatureSM4
+
+	// Dot Product instructions(UDOT/SDOT) implemented.
+	ARM64FeatureASIMDDP
+
+	// SHA2 instructions(SHA512H/SHA512H2/SHA512SU0) implemented.
+	ARM64FeatureSHA512
+
+	// Scalable Vector Extension implemented.
+	ARM64FeatureSVE
+
+	// FMLAL and FMLSL instructions are implemented.
+	ARM64FeatureASIMDFHM
+)
+
+// ELF auxiliary vector tags
+const (
+	_AT_NULL   = 0  // End of vector
+	_AT_HWCAP  = 16 // hardware capability bit vector
+	_AT_HWCAP2 = 26 // hardware capability bit vector 2
+)
+
+// These should not be changed after they are initialized.
+var hwCap uint
+
+// To make emulation of /proc/cpuinfo easy, these names match the names of the
+// basic features in Linux defined in arch/arm64/kernel/cpuinfo.c.
+var arm64FeatureStrings = map[Feature]string{
+	ARM64FeatureFP:       "fp",
+	ARM64FeatureASIMD:    "asimd",
+	ARM64FeatureEVTSTRM:  "evtstrm",
+	ARM64FeatureAES:      "aes",
+	ARM64FeaturePMULL:    "pmull",
+	ARM64FeatureSHA1:     "sha1",
+	ARM64FeatureSHA2:     "sha2",
+	ARM64FeatureCRC32:    "crc32",
+	ARM64FeatureATOMICS:  "atomics",
+	ARM64FeatureFPHP:     "fphp",
+	ARM64FeatureASIMDHP:  "asimdhp",
+	ARM64FeatureCPUID:    "cpuid",
+	ARM64FeatureASIMDRDM: "asimdrdm",
+	ARM64FeatureJSCVT:    "jscvt",
+	ARM64FeatureFCMA:     "fcma",
+	ARM64FeatureLRCPC:    "lrcpc",
+	ARM64FeatureDCPOP:    "dcpop",
+	ARM64FeatureSHA3:     "sha3",
+	ARM64FeatureSM3:      "sm3",
+	ARM64FeatureSM4:      "sm4",
+	ARM64FeatureASIMDDP:  "asimddp",
+	ARM64FeatureSHA512:   "sha512",
+	ARM64FeatureSVE:      "sve",
+	ARM64FeatureASIMDFHM: "asimdfhm",
+}
+
+var (
+	cpuFreqMHz float64
+	cpuImplHex uint64
+	cpuArchDec uint64
+	cpuVarHex  uint64
+	cpuPartHex uint64
+	cpuRevDec  uint64
+)
+
+// arm64FeaturesFromString includes features from arm64FeatureStrings.
+var arm64FeaturesFromString = make(map[string]Feature)
+
+// FeatureFromString returns the Feature associated with the given feature
+// string plus a bool to indicate if it could find the feature.
+func FeatureFromString(s string) (Feature, bool) {
+	f, b := arm64FeaturesFromString[s]
+	return f, b
+}
+
+// String implements fmt.Stringer.
+func (f Feature) String() string {
+	if s := f.flagString(); s != "" {
+		return s
+	}
+
+	return fmt.Sprintf("<cpuflag %d>", f)
+}
+
+func (f Feature) flagString() string {
+	if s, ok := arm64FeatureStrings[f]; ok {
+		return s
+	}
+
+	return ""
+}
+
+// FeatureSet is a set of Features for a CPU.
+//
+// +stateify savable
+type FeatureSet struct {
+	// Set is the set of features that are enabled in this FeatureSet.
+	Set map[Feature]bool
+
+	// CPUImplementer is part of the processor signature.
+	CPUImplementer uint8
+
+	// CPUArchitecture is part of the processor signature.
+	CPUArchitecture uint8
+
+	// CPUVariant is part of the processor signature.
+	CPUVariant uint8
+
+	// CPUPartnum is part of the processor signature.
+	CPUPartnum uint16
+
+	// CPURevision is part of the processor signature.
+	CPURevision uint8
+}
+
+// CheckHostCompatible returns nil if fs is a subset of the host feature set.
+// Noop on arm64.
+func (fs *FeatureSet) CheckHostCompatible() error {
+	return nil
+}
+
+// ExtendedStateSize returns the number of bytes needed to save the "extended
+// state" for this processor and the boundary it must be aligned to. Extended
+// state includes floating point(NEON) registers, and other cpu state that's not
+// associated with the normal task context.
+func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
+	// ARMv8 provide 32x128bits NEON registers.
+	//
+	// Ref arch/arm64/include/uapi/asm/ptrace.h
+	// struct user_fpsimd_state {
+	//        __uint128_t     vregs[32];
+	//        __u32           fpsr;
+	//	  __u32           fpcr;
+	//	  __u32           __reserved[2];
+	// };
+	return 528, 16
+}
+
+// HasFeature tests whether or not a feature is in the given feature set.
+func (fs *FeatureSet) HasFeature(feature Feature) bool {
+	return fs.Set[feature]
+}
+
+// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction.
+// Noop on arm64.
+func (fs *FeatureSet) UseXsave() bool {
+	return false
+}
+
+// FlagsString prints out supported CPU "flags" field in /proc/cpuinfo.
+func (fs *FeatureSet) FlagsString() string {
+	var s []string
+	for f, _ := range arm64FeatureStrings {
+		if fs.Set[f] {
+			if fstr := f.flagString(); fstr != "" {
+				s = append(s, fstr)
+			}
+		}
+	}
+	return strings.Join(s, " ")
+}
+
+// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is
+// a minimal /proc/cpuinfo, and the bogomips field is simply made up.
+func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
+	fmt.Fprintf(b, "processor\t: %d\n", cpu)
+	fmt.Fprintf(b, "BogoMIPS\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
+	fmt.Fprintf(b, "Features\t\t: %s\n", fs.FlagsString())
+	fmt.Fprintf(b, "CPU implementer\t: 0x%x\n", cpuImplHex)
+	fmt.Fprintf(b, "CPU architecture\t: %d\n", cpuArchDec)
+	fmt.Fprintf(b, "CPU variant\t: 0x%x\n", cpuVarHex)
+	fmt.Fprintf(b, "CPU part\t: 0x%x\n", cpuPartHex)
+	fmt.Fprintf(b, "CPU revision\t: %d\n", cpuRevDec)
+	fmt.Fprintln(b, "") // The /proc/cpuinfo file ends with an extra newline.
+}
+
+// HostFeatureSet uses hwCap to get host values and construct a feature set
+// that matches that of the host machine.
+func HostFeatureSet() *FeatureSet {
+	s := make(map[Feature]bool)
+
+	for f, _ := range arm64FeatureStrings {
+		if hwCap&(1<<f) != 0 {
+			s[f] = true
+		}
+	}
+
+	return &FeatureSet{
+		Set:             s,
+		CPUImplementer:  uint8(cpuImplHex),
+		CPUArchitecture: uint8(cpuArchDec),
+		CPUVariant:      uint8(cpuVarHex),
+		CPUPartnum:      uint16(cpuPartHex),
+		CPURevision:     uint8(cpuRevDec),
+	}
+}
+
+// Reads bogomips from host /proc/cpuinfo. Must run before whitelisting.
+// This value is used to create the fake /proc/cpuinfo from a FeatureSet.
+func initCPUInfo() {
+	cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo")
+	if err != nil {
+		// Leave it as 0. The standalone VDSO bails out in the same way.
+		log.Warningf("Could not read /proc/cpuinfo: %v", err)
+		return
+	}
+	cpuinfo := string(cpuinfob)
+
+	// We get the value straight from host /proc/cpuinfo.
+	for _, line := range strings.Split(cpuinfo, "\n") {
+		switch {
+		case strings.Contains(line, "BogoMIPS"):
+			{
+				splitMHz := strings.Split(line, ":")
+				if len(splitMHz) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed BogoMIPS")
+					break
+				}
+
+				// If there was a problem, leave cpuFreqMHz as 0.
+				var err error
+				cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64)
+				if err != nil {
+					log.Warningf("Could not parse BogoMIPS value %v: %v", splitMHz[1], err)
+					cpuFreqMHz = 0
+				}
+			}
+		case strings.Contains(line, "CPU implementer"):
+			{
+				splitImpl := strings.Split(line, ":")
+				if len(splitImpl) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU implementer")
+					break
+				}
+
+				// If there was a problem, leave cpuImplHex as 0.
+				var err error
+				cpuImplHex, err = strconv.ParseUint(strings.TrimSpace(splitImpl[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU implementer value %v: %v", splitImpl[1], err)
+					cpuImplHex = 0
+				}
+			}
+		case strings.Contains(line, "CPU architecture"):
+			{
+				splitArch := strings.Split(line, ":")
+				if len(splitArch) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU architecture")
+					break
+				}
+
+				// If there was a problem, leave cpuArchDec as 0.
+				var err error
+				cpuArchDec, err = strconv.ParseUint(strings.TrimSpace(splitArch[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU architecture value %v: %v", splitArch[1], err)
+					cpuArchDec = 0
+				}
+			}
+		case strings.Contains(line, "CPU variant"):
+			{
+				splitVar := strings.Split(line, ":")
+				if len(splitVar) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU variant")
+					break
+				}
+
+				// If there was a problem, leave cpuVarHex as 0.
+				var err error
+				cpuVarHex, err = strconv.ParseUint(strings.TrimSpace(splitVar[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU variant value %v: %v", splitVar[1], err)
+					cpuVarHex = 0
+				}
+			}
+		case strings.Contains(line, "CPU part"):
+			{
+				splitPart := strings.Split(line, ":")
+				if len(splitPart) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU part")
+					break
+				}
+
+				// If there was a problem, leave cpuPartHex as 0.
+				var err error
+				cpuPartHex, err = strconv.ParseUint(strings.TrimSpace(splitPart[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU part value %v: %v", splitPart[1], err)
+					cpuPartHex = 0
+				}
+			}
+		case strings.Contains(line, "CPU revision"):
+			{
+				splitRev := strings.Split(line, ":")
+				if len(splitRev) < 2 {
+					log.Warningf("Could not read /proc/cpuinfo: malformed CPU revision")
+					break
+				}
+
+				// If there was a problem, leave cpuRevDec as 0.
+				var err error
+				cpuRevDec, err = strconv.ParseUint(strings.TrimSpace(splitRev[1]), 0, 64)
+				if err != nil {
+					log.Warningf("Could not parse CPU revision value %v: %v", splitRev[1], err)
+					cpuRevDec = 0
+				}
+			}
+		}
+	}
+}
+
+// The auxiliary vector of a process on the Linux system can be read
+// from /proc/self/auxv, and tags and values are stored as 8-bytes
+// decimal key-value pairs on the 64-bit system.
+//
+// $ od -t d8 /proc/self/auxv
+//  0000000                   33      140734615224320
+//  0000020                   16           3219913727
+//  0000040                    6                 4096
+//  0000060                   17                  100
+//  0000100                    3       94665627353152
+//  0000120                    4                   56
+//  0000140                    5                    9
+//  0000160                    7      140425502162944
+//  0000200                    8                    0
+//  0000220                    9       94665627365760
+//  0000240                   11                 1000
+//  0000260                   12                 1000
+//  0000300                   13                 1000
+//  0000320                   14                 1000
+//  0000340                   23                    0
+//  0000360                   25      140734614619513
+//  0000400                   26                    0
+//  0000420                   31      140734614626284
+//  0000440                   15      140734614619529
+//  0000460                    0                    0
+func initHwCap() {
+	auxv, err := ioutil.ReadFile("/proc/self/auxv")
+	if err != nil {
+		log.Warningf("Could not read /proc/self/auxv: %v", err)
+		return
+	}
+
+	l := len(auxv) / 16
+	for i := 0; i < l; i++ {
+		tag := binary.LittleEndian.Uint64(auxv[i*16:])
+		val := binary.LittleEndian.Uint64(auxv[(i*16 + 8):])
+		if tag == _AT_HWCAP {
+			hwCap = uint(val)
+			break
+		}
+	}
+}
+
+func initFeaturesFromString() {
+	for f, s := range arm64FeatureStrings {
+		arm64FeaturesFromString[s] = f
+	}
+}
+
+func init() {
+	initCPUInfo()
+	initHwCap()
+	initFeaturesFromString()
+}
diff --git a/pkg/cpuid/cpuid_arm64_test.go b/pkg/cpuid/cpuid_arm64_test.go
new file mode 100644
index 000000000..a34f67779
--- /dev/null
+++ b/pkg/cpuid/cpuid_arm64_test.go
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package cpuid
+
+import (
+	"testing"
+)
+
+var justFP = &FeatureSet{
+	Set: map[Feature]bool{
+		ARM64FeatureFP: true,
+	}}
+
+func TestHostFeatureSet(t *testing.T) {
+	hostFeatures := HostFeatureSet()
+	if len(hostFeatures.Set) == 0 {
+		t.Errorf("Got invalid feature set %v from HostFeatureSet()", hostFeatures)
+	}
+}
+
+func TestHasFeature(t *testing.T) {
+	if !justFP.HasFeature(ARM64FeatureFP) {
+		t.Errorf("HasFeature failed, %v should contain %v", justFP, ARM64FeatureFP)
+	}
+
+	if justFP.HasFeature(ARM64FeatureSM3) {
+		t.Errorf("HasFeature failed, %v should not contain %v", justFP, ARM64FeatureSM3)
+	}
+}
+
+func TestFeatureFromString(t *testing.T) {
+	f, ok := FeatureFromString("asimd")
+	if f != ARM64FeatureASIMD || !ok {
+		t.Errorf("got %v want asimd", f)
+	}
+
+	f, ok = FeatureFromString("bad")
+	if ok {
+		t.Errorf("got %v want nothing", f)
+	}
+}
diff --git a/pkg/cpuid/cpuid_parse_test.go b/pkg/cpuid/cpuid_parse_test.go
deleted file mode 100644
index dd9969db4..000000000
--- a/pkg/cpuid/cpuid_parse_test.go
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package cpuid
-
-import (
-	"fmt"
-	"io/ioutil"
-	"regexp"
-	"strconv"
-	"strings"
-	"syscall"
-	"testing"
-)
-
-func kernelVersion() (int, int, error) {
-	var u syscall.Utsname
-	if err := syscall.Uname(&u); err != nil {
-		return 0, 0, err
-	}
-
-	var r string
-	for _, b := range u.Release {
-		if b == 0 {
-			break
-		}
-		r += string(b)
-	}
-
-	s := strings.Split(r, ".")
-	if len(s) < 2 {
-		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", r)
-	}
-
-	major, err := strconv.Atoi(s[0])
-	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %v", s[0], r, err)
-	}
-
-	minor, err := strconv.Atoi(s[1])
-	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %v", s[1], r, err)
-	}
-
-	return major, minor, nil
-}
-
-// TestHostFeatureFlags tests that all features detected by HostFeatureSet are
-// on the host.
-//
-// It does *not* verify that all features reported by the host are detected by
-// HostFeatureSet.
-//
-// i.e., test that HostFeatureSet is a subset of the host features.
-func TestHostFeatureFlags(t *testing.T) {
-	cpuinfoBytes, _ := ioutil.ReadFile("/proc/cpuinfo")
-	cpuinfo := string(cpuinfoBytes)
-	t.Logf("Host cpu info:\n%s", cpuinfo)
-
-	major, minor, err := kernelVersion()
-	if err != nil {
-		t.Fatalf("Unable to parse kernel version: %v", err)
-	}
-
-	re := regexp.MustCompile(`(?m)^flags\s+: (.*)$`)
-	m := re.FindStringSubmatch(cpuinfo)
-	if len(m) != 2 {
-		t.Fatalf("Unable to extract flags from %q", cpuinfo)
-	}
-
-	cpuinfoFlags := make(map[string]struct{})
-	for _, f := range strings.Split(m[1], " ") {
-		cpuinfoFlags[f] = struct{}{}
-	}
-
-	fs := HostFeatureSet()
-
-	// All features have a string and appear in host cpuinfo.
-	for f := range fs.Set {
-		name := f.flagString(false)
-		if name == "" {
-			t.Errorf("Non-parsable feature: %v", f)
-		}
-
-		// Special cases not consistently visible. We don't mind if
-		// they are exposed in earlier versions.
-		switch {
-		// Block 0.
-		case f == X86FeatureSDBG && (major < 4 || major == 4 && minor < 3):
-			// SDBG only exposed in
-			// b1c599b8ff80ea79b9f8277a3f9f36a7b0cfedce (4.3).
-			continue
-		// Block 2.
-		case f == X86FeatureRDT && (major < 4 || major == 4 && minor < 10):
-			// RDT only exposed in
-			// 4ab1586488cb56ed8728e54c4157cc38646874d9 (4.10).
-			continue
-		// Block 3.
-		case f == X86FeatureAVX512VBMI && (major < 4 || major == 4 && minor < 10):
-			// AVX512VBMI only exposed in
-			// a8d9df5a509a232a959e4ef2e281f7ecd77810d6 (4.10).
-			continue
-		case f == X86FeatureUMIP && (major < 4 || major == 4 && minor < 15):
-			// UMIP only exposed in
-			// 3522c2a6a4f341058b8291326a945e2a2d2aaf55 (4.15).
-			continue
-		case f == X86FeaturePKU && (major < 4 || major == 4 && minor < 9):
-			// PKU only exposed in
-			// dfb4a70f20c5b3880da56ee4c9484bdb4e8f1e65 (4.9).
-			continue
-		// Block 4.
-		case f == X86FeatureXSAVES && (major < 4 || major == 4 && minor < 8):
-			// XSAVES only exposed in
-			// b8be15d588060a03569ac85dc4a0247460988f5b (4.8).
-			continue
-		// Block 5.
-		case f == X86FeaturePERFCTR_LLC && (major < 4 || major == 4 && minor < 14):
-			// PERFCTR_LLC renamed in
-			// 910448bbed066ab1082b510eef1ae61bb792d854 (4.14).
-			continue
-		}
-
-		hidden := f.flagString(true) == ""
-		_, ok := cpuinfoFlags[name]
-		if hidden && ok {
-			t.Errorf("Unexpectedly hidden flag: %v", f)
-		} else if !hidden && !ok {
-			t.Errorf("Non-native flag: %v", f)
-		}
-	}
-}
diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go
new file mode 100644
index 000000000..d48418e69
--- /dev/null
+++ b/pkg/cpuid/cpuid_parse_x86_test.go
@@ -0,0 +1,144 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package cpuid
+
+import (
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"strconv"
+	"strings"
+	"syscall"
+	"testing"
+)
+
+func kernelVersion() (int, int, error) {
+	var u syscall.Utsname
+	if err := syscall.Uname(&u); err != nil {
+		return 0, 0, err
+	}
+
+	var r string
+	for _, b := range u.Release {
+		if b == 0 {
+			break
+		}
+		r += string(b)
+	}
+
+	s := strings.Split(r, ".")
+	if len(s) < 2 {
+		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", r)
+	}
+
+	major, err := strconv.Atoi(s[0])
+	if err != nil {
+		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %v", s[0], r, err)
+	}
+
+	minor, err := strconv.Atoi(s[1])
+	if err != nil {
+		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %v", s[1], r, err)
+	}
+
+	return major, minor, nil
+}
+
+// TestHostFeatureFlags tests that all features detected by HostFeatureSet are
+// on the host.
+//
+// It does *not* verify that all features reported by the host are detected by
+// HostFeatureSet.
+//
+// i.e., test that HostFeatureSet is a subset of the host features.
+func TestHostFeatureFlags(t *testing.T) {
+	cpuinfoBytes, _ := ioutil.ReadFile("/proc/cpuinfo")
+	cpuinfo := string(cpuinfoBytes)
+	t.Logf("Host cpu info:\n%s", cpuinfo)
+
+	major, minor, err := kernelVersion()
+	if err != nil {
+		t.Fatalf("Unable to parse kernel version: %v", err)
+	}
+
+	re := regexp.MustCompile(`(?m)^flags\s+: (.*)$`)
+	m := re.FindStringSubmatch(cpuinfo)
+	if len(m) != 2 {
+		t.Fatalf("Unable to extract flags from %q", cpuinfo)
+	}
+
+	cpuinfoFlags := make(map[string]struct{})
+	for _, f := range strings.Split(m[1], " ") {
+		cpuinfoFlags[f] = struct{}{}
+	}
+
+	fs := HostFeatureSet()
+
+	// All features have a string and appear in host cpuinfo.
+	for f := range fs.Set {
+		name := f.flagString(false)
+		if name == "" {
+			t.Errorf("Non-parsable feature: %v", f)
+		}
+
+		// Special cases not consistently visible. We don't mind if
+		// they are exposed in earlier versions.
+		switch {
+		// Block 0.
+		case f == X86FeatureSDBG && (major < 4 || major == 4 && minor < 3):
+			// SDBG only exposed in
+			// b1c599b8ff80ea79b9f8277a3f9f36a7b0cfedce (4.3).
+			continue
+		// Block 2.
+		case f == X86FeatureRDT && (major < 4 || major == 4 && minor < 10):
+			// RDT only exposed in
+			// 4ab1586488cb56ed8728e54c4157cc38646874d9 (4.10).
+			continue
+		// Block 3.
+		case f == X86FeatureAVX512VBMI && (major < 4 || major == 4 && minor < 10):
+			// AVX512VBMI only exposed in
+			// a8d9df5a509a232a959e4ef2e281f7ecd77810d6 (4.10).
+			continue
+		case f == X86FeatureUMIP && (major < 4 || major == 4 && minor < 15):
+			// UMIP only exposed in
+			// 3522c2a6a4f341058b8291326a945e2a2d2aaf55 (4.15).
+			continue
+		case f == X86FeaturePKU && (major < 4 || major == 4 && minor < 9):
+			// PKU only exposed in
+			// dfb4a70f20c5b3880da56ee4c9484bdb4e8f1e65 (4.9).
+			continue
+		// Block 4.
+		case f == X86FeatureXSAVES && (major < 4 || major == 4 && minor < 8):
+			// XSAVES only exposed in
+			// b8be15d588060a03569ac85dc4a0247460988f5b (4.8).
+			continue
+		// Block 5.
+		case f == X86FeaturePERFCTR_LLC && (major < 4 || major == 4 && minor < 14):
+			// PERFCTR_LLC renamed in
+			// 910448bbed066ab1082b510eef1ae61bb792d854 (4.14).
+			continue
+		}
+
+		hidden := f.flagString(true) == ""
+		_, ok := cpuinfoFlags[name]
+		if hidden && ok {
+			t.Errorf("Unexpectedly hidden flag: %v", f)
+		} else if !hidden && !ok {
+			t.Errorf("Non-native flag: %v", f)
+		}
+	}
+}
diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go
deleted file mode 100644
index a707ebb55..000000000
--- a/pkg/cpuid/cpuid_test.go
+++ /dev/null
@@ -1,241 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package cpuid
-
-import (
-	"testing"
-)
-
-// These are the default values of various FeatureSet fields.
-const (
-	defaultVendorID = "GenuineIntel"
-
-	// These processor signature defaults are derived from the values
-	// listed in Intel Application Note 485 for i7/Xeon processors.
-	defaultExtFamily  uint8 = 0
-	defaultExtModel   uint8 = 1
-	defaultType       uint8 = 0
-	defaultFamily     uint8 = 0x06
-	defaultModel      uint8 = 0x0a
-	defaultSteppingID uint8 = 0
-)
-
-// newEmptyFeatureSet creates a new FeatureSet with a sensible default model and no features.
-func newEmptyFeatureSet() *FeatureSet {
-	return &FeatureSet{
-		Set:            make(map[Feature]bool),
-		VendorID:       defaultVendorID,
-		ExtendedFamily: defaultExtFamily,
-		ExtendedModel:  defaultExtModel,
-		ProcessorType:  defaultType,
-		Family:         defaultFamily,
-		Model:          defaultModel,
-		SteppingID:     defaultSteppingID,
-	}
-}
-
-var justFPU = &FeatureSet{
-	Set: map[Feature]bool{
-		X86FeatureFPU: true,
-	}}
-
-var justFPUandPAE = &FeatureSet{
-	Set: map[Feature]bool{
-		X86FeatureFPU: true,
-		X86FeaturePAE: true,
-	}}
-
-func TestSubtract(t *testing.T) {
-	if diff := justFPU.Subtract(justFPUandPAE); diff != nil {
-		t.Errorf("Got %v is not subset of %v, want diff (%v) to be nil", justFPU, justFPUandPAE, diff)
-	}
-
-	if justFPUandPAE.Subtract(justFPU) == nil {
-		t.Errorf("Got %v is a subset of %v, want diff to be nil", justFPU, justFPUandPAE)
-	}
-}
-
-// TODO(b/73346484): Run this test on a very old platform, and make sure more
-// bits are enabled than just FPU and PAE. This test currently may not detect
-// if HostFeatureSet gives back junk bits.
-func TestHostFeatureSet(t *testing.T) {
-	hostFeatures := HostFeatureSet()
-	if justFPUandPAE.Subtract(hostFeatures) != nil {
-		t.Errorf("Got invalid feature set %v from HostFeatureSet()", hostFeatures)
-	}
-}
-
-func TestHasFeature(t *testing.T) {
-	if !justFPU.HasFeature(X86FeatureFPU) {
-		t.Errorf("HasFeature failed, %v should contain %v", justFPU, X86FeatureFPU)
-	}
-
-	if justFPU.HasFeature(X86FeatureAVX) {
-		t.Errorf("HasFeature failed, %v should not contain %v", justFPU, X86FeatureAVX)
-	}
-}
-
-// Note: these tests are aware of and abuse internal details of FeatureSets.
-// Users of FeatureSets should not depend on this.
-func TestAdd(t *testing.T) {
-	// Test a basic insertion into the FeatureSet.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureCLFSH)
-	if len(testFeatures.Set) != 1 {
-		t.Errorf("Got length %v want 1", len(testFeatures.Set))
-	}
-
-	if !testFeatures.HasFeature(X86FeatureCLFSH) {
-		t.Errorf("Add failed, got %v want set with %v", testFeatures, X86FeatureCLFSH)
-	}
-
-	// Test that duplicates are ignored.
-	testFeatures.Add(X86FeatureCLFSH)
-	if len(testFeatures.Set) != 1 {
-		t.Errorf("Got length %v, want 1", len(testFeatures.Set))
-	}
-}
-
-func TestRemove(t *testing.T) {
-	// Try removing the last feature.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureFPU)
-	testFeatures.Add(X86FeaturePAE)
-	testFeatures.Remove(X86FeaturePAE)
-	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 || testFeatures.HasFeature(X86FeaturePAE) {
-		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
-	}
-
-	// Try removing a feature not in the set.
-	testFeatures.Remove(X86FeatureRDRAND)
-	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 {
-		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
-	}
-}
-
-func TestFeatureFromString(t *testing.T) {
-	f, ok := FeatureFromString("avx")
-	if f != X86FeatureAVX || !ok {
-		t.Errorf("got %v want avx", f)
-	}
-
-	f, ok = FeatureFromString("bad")
-	if ok {
-		t.Errorf("got %v want nothing", f)
-	}
-}
-
-// This tests function 0 (eax=0), which returns the vendor ID and highest cpuid
-// function reported to be available.
-func TestEmulateIDVendorAndLength(t *testing.T) {
-	testFeatures := newEmptyFeatureSet()
-
-	ax, bx, cx, dx := testFeatures.EmulateID(0, 0)
-	wantEax := uint32(0xd) // Highest supported cpuid function.
-
-	// These magical constants are the characters of "GenuineIntel".
-	// See Intel AN485 for a reference on why they are laid out like this.
-	wantEbx := uint32(0x756e6547)
-	wantEcx := uint32(0x6c65746e)
-	wantEdx := uint32(0x49656e69)
-	if wantEax != ax {
-		t.Errorf("highest function failed, got %x want %x", ax, wantEax)
-	}
-
-	if wantEbx != bx || wantEcx != cx || wantEdx != dx {
-		t.Errorf("vendor string emulation failed, bx:cx:dx, got %x:%x:%x want %x:%x:%x", bx, cx, dx, wantEbx, wantEcx, wantEdx)
-	}
-}
-
-func TestEmulateIDBasicFeatures(t *testing.T) {
-	// Make a minimal test feature set.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureCLFSH)
-	testFeatures.Add(X86FeatureAVX)
-	testFeatures.CacheLine = 64
-
-	ax, bx, cx, dx := testFeatures.EmulateID(1, 0)
-	ECXAVXBit := uint32(1 << uint(X86FeatureAVX))
-	EDXCLFlushBit := uint32(1 << uint(X86FeatureCLFSH-32)) // We adjust by 32 since it's in block 1.
-
-	if EDXCLFlushBit&dx == 0 || dx&^EDXCLFlushBit != 0 {
-		t.Errorf("EmulateID failed, got feature bits %x want %x", dx, testFeatures.blockMask(1))
-	}
-
-	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
-		t.Errorf("EmulateID failed, got feature bits %x want %x", cx, testFeatures.blockMask(0))
-	}
-
-	// Default signature bits, based on values for i7/Xeon.
-	// See Intel AN485 for information on stepping/model bits.
-	defaultSignature := uint32(0x000106a0)
-	if defaultSignature != ax {
-		t.Errorf("EmulateID stepping emulation failed, got %x want %x", ax, defaultSignature)
-	}
-
-	clflushSizeInfo := uint32(8 << 8)
-	if clflushSizeInfo != bx {
-		t.Errorf("EmulateID bx emulation failed, got %x want %x", bx, clflushSizeInfo)
-	}
-}
-
-func TestEmulateIDExtendedFeatures(t *testing.T) {
-	// Make a minimal test feature set, one bit in each extended feature word.
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureSMEP)
-	testFeatures.Add(X86FeatureAVX512VBMI)
-
-	ax, bx, cx, dx := testFeatures.EmulateID(7, 0)
-	EBXSMEPBit := uint32(1 << uint(X86FeatureSMEP-2*32))      // Adjust by 2*32 since SMEP is a block 2 feature.
-	ECXAVXBit := uint32(1 << uint(X86FeatureAVX512VBMI-3*32)) // We adjust by 3*32 since it's a block 3 feature.
-
-	// Test that the desired bit is set and no other bits are set.
-	if EBXSMEPBit&bx == 0 || bx&^EBXSMEPBit != 0 {
-		t.Errorf("extended feature emulation failed, got feature bits %x want %x", bx, testFeatures.blockMask(2))
-	}
-
-	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
-		t.Errorf("extended feature emulation failed, got feature bits %x want %x", cx, testFeatures.blockMask(3))
-	}
-
-	if ax != 0 || dx != 0 {
-		t.Errorf("extended feature emulation failed, ax:dx, got %x:%x want 0:0", ax, dx)
-	}
-
-	// Check that no subleaves other than 0 do anything.
-	ax, bx, cx, dx = testFeatures.EmulateID(7, 1)
-	if ax != 0 || bx != 0 || cx != 0 || dx != 0 {
-		t.Errorf("extended feature emulation failed, got %x:%x:%x:%x want 0:0", ax, bx, cx, dx)
-	}
-
-}
-
-// Checks that the expected extended features are available via cpuid functions
-// 0x80000000 and up.
-func TestEmulateIDExtended(t *testing.T) {
-	testFeatures := newEmptyFeatureSet()
-	testFeatures.Add(X86FeatureSYSCALL)
-	EDXSYSCALLBit := uint32(1 << uint(X86FeatureSYSCALL-6*32)) // Adjust by 6*32 since SYSCALL is a block 6 feature.
-
-	ax, bx, cx, dx := testFeatures.EmulateID(0x80000000, 0)
-	if ax != 0x80000001 || bx != 0 || cx != 0 || dx != 0 {
-		t.Errorf("EmulateID extended emulation failed, ax:bx:cx:dx, got %x:%x:%x:%x want 0x80000001:0:0:0", ax, bx, cx, dx)
-	}
-
-	_, _, _, dx = testFeatures.EmulateID(0x80000001, 0)
-	if EDXSYSCALLBit&dx == 0 || dx&^EDXSYSCALLBit != 0 {
-		t.Errorf("extended feature emulation failed, got feature bits %x want %x", dx, testFeatures.blockMask(6))
-	}
-}
diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go
new file mode 100644
index 000000000..333ca0a04
--- /dev/null
+++ b/pkg/cpuid/cpuid_x86.go
@@ -0,0 +1,1100 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package cpuid
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// Common references for CPUID leaves and bits:
+//
+// Intel:
+//   * Intel SDM Volume 2, Chapter 3.2 "CPUID" (more up-to-date)
+//   * Intel Application Note 485 (more detailed)
+//
+// AMD:
+//   * AMD64 APM Volume 3, Appendix 3 "Obtaining Processor Information ..."
+
+// block is a collection of 32 Feature bits.
+type block int
+
+const blockSize = 32
+
+// Feature bits are numbered according to "blocks". Each block is 32 bits, and
+// feature bits from the same source (cpuid leaf/level) are in the same block.
+func featureID(b block, bit int) Feature {
+	return Feature(32*int(b) + bit)
+}
+
+// Block 0 constants are all of the "basic" feature bits returned by a cpuid in
+// ecx with eax=1.
+const (
+	X86FeatureSSE3 Feature = iota
+	X86FeaturePCLMULDQ
+	X86FeatureDTES64
+	X86FeatureMONITOR
+	X86FeatureDSCPL
+	X86FeatureVMX
+	X86FeatureSMX
+	X86FeatureEST
+	X86FeatureTM2
+	X86FeatureSSSE3 // Not a typo, "supplemental" SSE3.
+	X86FeatureCNXTID
+	X86FeatureSDBG
+	X86FeatureFMA
+	X86FeatureCX16
+	X86FeatureXTPR
+	X86FeaturePDCM
+	_ // ecx bit 16 is reserved.
+	X86FeaturePCID
+	X86FeatureDCA
+	X86FeatureSSE4_1
+	X86FeatureSSE4_2
+	X86FeatureX2APIC
+	X86FeatureMOVBE
+	X86FeaturePOPCNT
+	X86FeatureTSCD
+	X86FeatureAES
+	X86FeatureXSAVE
+	X86FeatureOSXSAVE
+	X86FeatureAVX
+	X86FeatureF16C
+	X86FeatureRDRAND
+	_ // ecx bit 31 is reserved.
+)
+
+// Block 1 constants are all of the "basic" feature bits returned by a cpuid in
+// edx with eax=1.
+const (
+	X86FeatureFPU Feature = 32 + iota
+	X86FeatureVME
+	X86FeatureDE
+	X86FeaturePSE
+	X86FeatureTSC
+	X86FeatureMSR
+	X86FeaturePAE
+	X86FeatureMCE
+	X86FeatureCX8
+	X86FeatureAPIC
+	_ // edx bit 10 is reserved.
+	X86FeatureSEP
+	X86FeatureMTRR
+	X86FeaturePGE
+	X86FeatureMCA
+	X86FeatureCMOV
+	X86FeaturePAT
+	X86FeaturePSE36
+	X86FeaturePSN
+	X86FeatureCLFSH
+	_ // edx bit 20 is reserved.
+	X86FeatureDS
+	X86FeatureACPI
+	X86FeatureMMX
+	X86FeatureFXSR
+	X86FeatureSSE
+	X86FeatureSSE2
+	X86FeatureSS
+	X86FeatureHTT
+	X86FeatureTM
+	X86FeatureIA64
+	X86FeaturePBE
+)
+
+// Block 2 bits are the "structured extended" features returned in ebx for
+// eax=7, ecx=0.
+const (
+	X86FeatureFSGSBase Feature = 2*32 + iota
+	X86FeatureTSC_ADJUST
+	_ // ebx bit 2 is reserved.
+	X86FeatureBMI1
+	X86FeatureHLE
+	X86FeatureAVX2
+	X86FeatureFDP_EXCPTN_ONLY
+	X86FeatureSMEP
+	X86FeatureBMI2
+	X86FeatureERMS
+	X86FeatureINVPCID
+	X86FeatureRTM
+	X86FeatureCQM
+	X86FeatureFPCSDS
+	X86FeatureMPX
+	X86FeatureRDT
+	X86FeatureAVX512F
+	X86FeatureAVX512DQ
+	X86FeatureRDSEED
+	X86FeatureADX
+	X86FeatureSMAP
+	X86FeatureAVX512IFMA
+	X86FeaturePCOMMIT
+	X86FeatureCLFLUSHOPT
+	X86FeatureCLWB
+	X86FeatureIPT // Intel processor trace.
+	X86FeatureAVX512PF
+	X86FeatureAVX512ER
+	X86FeatureAVX512CD
+	X86FeatureSHA
+	X86FeatureAVX512BW
+	X86FeatureAVX512VL
+)
+
+// Block 3 bits are the "extended" features returned in ecx for eax=7, ecx=0.
+const (
+	X86FeaturePREFETCHWT1 Feature = 3*32 + iota
+	X86FeatureAVX512VBMI
+	X86FeatureUMIP
+	X86FeaturePKU
+	X86FeatureOSPKE
+	X86FeatureWAITPKG
+	X86FeatureAVX512_VBMI2
+	_ // ecx bit 7 is reserved
+	X86FeatureGFNI
+	X86FeatureVAES
+	X86FeatureVPCLMULQDQ
+	X86FeatureAVX512_VNNI
+	X86FeatureAVX512_BITALG
+	X86FeatureTME
+	X86FeatureAVX512_VPOPCNTDQ
+	_ // ecx bit 15 is reserved
+	X86FeatureLA57
+	// ecx bits 17-21 are reserved
+	_
+	_
+	_
+	_
+	_
+	X86FeatureRDPID
+	// ecx bits 23-24 are reserved
+	_
+	_
+	X86FeatureCLDEMOTE
+	_ // ecx bit 26 is reserved
+	X86FeatureMOVDIRI
+	X86FeatureMOVDIR64B
+)
+
+// Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX.
+// The CPUID leaf is available only if 'X86FeatureXSAVE' is present.
+const (
+	X86FeatureXSAVEOPT Feature = 4*32 + iota
+	X86FeatureXSAVEC
+	X86FeatureXGETBV1
+	X86FeatureXSAVES
+	// EAX[31:4] are reserved.
+)
+
+// Block 5 constants are the extended feature bits in
+// CPUID.(EAX=0x80000001):ECX.
+const (
+	X86FeatureLAHF64 Feature = 5*32 + iota
+	X86FeatureCMP_LEGACY
+	X86FeatureSVM
+	X86FeatureEXTAPIC
+	X86FeatureCR8_LEGACY
+	X86FeatureLZCNT
+	X86FeatureSSE4A
+	X86FeatureMISALIGNSSE
+	X86FeaturePREFETCHW
+	X86FeatureOSVW
+	X86FeatureIBS
+	X86FeatureXOP
+	X86FeatureSKINIT
+	X86FeatureWDT
+	_ // ecx bit 14 is reserved.
+	X86FeatureLWP
+	X86FeatureFMA4
+	X86FeatureTCE
+	_ // ecx bit 18 is reserved.
+	_ // ecx bit 19 is reserved.
+	_ // ecx bit 20 is reserved.
+	X86FeatureTBM
+	X86FeatureTOPOLOGY
+	X86FeaturePERFCTR_CORE
+	X86FeaturePERFCTR_NB
+	_ // ecx bit 25 is reserved.
+	X86FeatureBPEXT
+	X86FeaturePERFCTR_TSC
+	X86FeaturePERFCTR_LLC
+	X86FeatureMWAITX
+	// ECX[31:30] are reserved.
+)
+
+// Block 6 constants are the extended feature bits in
+// CPUID.(EAX=0x80000001):EDX.
+//
+// These are sparse, and so the bit positions are assigned manually.
+const (
+	// On AMD, EDX[24:23] | EDX[17:12] | EDX[9:0] are duplicate features
+	// also defined in block 1 (in identical bit positions). Those features
+	// are not listed here.
+	block6DuplicateMask = 0x183f3ff
+
+	X86FeatureSYSCALL  Feature = 6*32 + 11
+	X86FeatureNX       Feature = 6*32 + 20
+	X86FeatureMMXEXT   Feature = 6*32 + 22
+	X86FeatureFXSR_OPT Feature = 6*32 + 25
+	X86FeatureGBPAGES  Feature = 6*32 + 26
+	X86FeatureRDTSCP   Feature = 6*32 + 27
+	X86FeatureLM       Feature = 6*32 + 29
+	X86Feature3DNOWEXT Feature = 6*32 + 30
+	X86Feature3DNOW    Feature = 6*32 + 31
+)
+
+// linuxBlockOrder defines the order in which linux organizes the feature
+// blocks. Linux also tracks feature bits in 32-bit blocks, but in an order
+// which doesn't match well here, so for the /proc/cpuinfo generation we simply
+// re-map the blocks to Linux's ordering and then go through the bits in each
+// block.
+var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4, 3}
+
+// To make emulation of /proc/cpuinfo easy, these names match the names of the
+// basic features in Linux defined in arch/x86/kernel/cpu/capflags.c.
+var x86FeatureStrings = map[Feature]string{
+	// Block 0.
+	X86FeatureSSE3:     "pni",
+	X86FeaturePCLMULDQ: "pclmulqdq",
+	X86FeatureDTES64:   "dtes64",
+	X86FeatureMONITOR:  "monitor",
+	X86FeatureDSCPL:    "ds_cpl",
+	X86FeatureVMX:      "vmx",
+	X86FeatureSMX:      "smx",
+	X86FeatureEST:      "est",
+	X86FeatureTM2:      "tm2",
+	X86FeatureSSSE3:    "ssse3",
+	X86FeatureCNXTID:   "cid",
+	X86FeatureSDBG:     "sdbg",
+	X86FeatureFMA:      "fma",
+	X86FeatureCX16:     "cx16",
+	X86FeatureXTPR:     "xtpr",
+	X86FeaturePDCM:     "pdcm",
+	X86FeaturePCID:     "pcid",
+	X86FeatureDCA:      "dca",
+	X86FeatureSSE4_1:   "sse4_1",
+	X86FeatureSSE4_2:   "sse4_2",
+	X86FeatureX2APIC:   "x2apic",
+	X86FeatureMOVBE:    "movbe",
+	X86FeaturePOPCNT:   "popcnt",
+	X86FeatureTSCD:     "tsc_deadline_timer",
+	X86FeatureAES:      "aes",
+	X86FeatureXSAVE:    "xsave",
+	X86FeatureAVX:      "avx",
+	X86FeatureF16C:     "f16c",
+	X86FeatureRDRAND:   "rdrand",
+
+	// Block 1.
+	X86FeatureFPU:   "fpu",
+	X86FeatureVME:   "vme",
+	X86FeatureDE:    "de",
+	X86FeaturePSE:   "pse",
+	X86FeatureTSC:   "tsc",
+	X86FeatureMSR:   "msr",
+	X86FeaturePAE:   "pae",
+	X86FeatureMCE:   "mce",
+	X86FeatureCX8:   "cx8",
+	X86FeatureAPIC:  "apic",
+	X86FeatureSEP:   "sep",
+	X86FeatureMTRR:  "mtrr",
+	X86FeaturePGE:   "pge",
+	X86FeatureMCA:   "mca",
+	X86FeatureCMOV:  "cmov",
+	X86FeaturePAT:   "pat",
+	X86FeaturePSE36: "pse36",
+	X86FeaturePSN:   "pn",
+	X86FeatureCLFSH: "clflush",
+	X86FeatureDS:    "dts",
+	X86FeatureACPI:  "acpi",
+	X86FeatureMMX:   "mmx",
+	X86FeatureFXSR:  "fxsr",
+	X86FeatureSSE:   "sse",
+	X86FeatureSSE2:  "sse2",
+	X86FeatureSS:    "ss",
+	X86FeatureHTT:   "ht",
+	X86FeatureTM:    "tm",
+	X86FeatureIA64:  "ia64",
+	X86FeaturePBE:   "pbe",
+
+	// Block 2.
+	X86FeatureFSGSBase:   "fsgsbase",
+	X86FeatureTSC_ADJUST: "tsc_adjust",
+	X86FeatureBMI1:       "bmi1",
+	X86FeatureHLE:        "hle",
+	X86FeatureAVX2:       "avx2",
+	X86FeatureSMEP:       "smep",
+	X86FeatureBMI2:       "bmi2",
+	X86FeatureERMS:       "erms",
+	X86FeatureINVPCID:    "invpcid",
+	X86FeatureRTM:        "rtm",
+	X86FeatureCQM:        "cqm",
+	X86FeatureMPX:        "mpx",
+	X86FeatureRDT:        "rdt_a",
+	X86FeatureAVX512F:    "avx512f",
+	X86FeatureAVX512DQ:   "avx512dq",
+	X86FeatureRDSEED:     "rdseed",
+	X86FeatureADX:        "adx",
+	X86FeatureSMAP:       "smap",
+	X86FeatureCLWB:       "clwb",
+	X86FeatureAVX512PF:   "avx512pf",
+	X86FeatureAVX512ER:   "avx512er",
+	X86FeatureAVX512CD:   "avx512cd",
+	X86FeatureSHA:        "sha_ni",
+	X86FeatureAVX512BW:   "avx512bw",
+	X86FeatureAVX512VL:   "avx512vl",
+
+	// Block 3.
+	X86FeatureAVX512VBMI:       "avx512vbmi",
+	X86FeatureUMIP:             "umip",
+	X86FeaturePKU:              "pku",
+	X86FeatureOSPKE:            "ospke",
+	X86FeatureWAITPKG:          "waitpkg",
+	X86FeatureAVX512_VBMI2:     "avx512_vbmi2",
+	X86FeatureGFNI:             "gfni",
+	X86FeatureVAES:             "vaes",
+	X86FeatureVPCLMULQDQ:       "vpclmulqdq",
+	X86FeatureAVX512_VNNI:      "avx512_vnni",
+	X86FeatureAVX512_BITALG:    "avx512_bitalg",
+	X86FeatureTME:              "tme",
+	X86FeatureAVX512_VPOPCNTDQ: "avx512_vpopcntdq",
+	X86FeatureLA57:             "la57",
+	X86FeatureRDPID:            "rdpid",
+	X86FeatureCLDEMOTE:         "cldemote",
+	X86FeatureMOVDIRI:          "movdiri",
+	X86FeatureMOVDIR64B:        "movdir64b",
+
+	// Block 4.
+	X86FeatureXSAVEOPT: "xsaveopt",
+	X86FeatureXSAVEC:   "xsavec",
+	X86FeatureXGETBV1:  "xgetbv1",
+	X86FeatureXSAVES:   "xsaves",
+
+	// Block 5.
+	X86FeatureLAHF64:       "lahf_lm", // LAHF/SAHF in long mode
+	X86FeatureCMP_LEGACY:   "cmp_legacy",
+	X86FeatureSVM:          "svm",
+	X86FeatureEXTAPIC:      "extapic",
+	X86FeatureCR8_LEGACY:   "cr8_legacy",
+	X86FeatureLZCNT:        "abm", // Advanced bit manipulation
+	X86FeatureSSE4A:        "sse4a",
+	X86FeatureMISALIGNSSE:  "misalignsse",
+	X86FeaturePREFETCHW:    "3dnowprefetch",
+	X86FeatureOSVW:         "osvw",
+	X86FeatureIBS:          "ibs",
+	X86FeatureXOP:          "xop",
+	X86FeatureSKINIT:       "skinit",
+	X86FeatureWDT:          "wdt",
+	X86FeatureLWP:          "lwp",
+	X86FeatureFMA4:         "fma4",
+	X86FeatureTCE:          "tce",
+	X86FeatureTBM:          "tbm",
+	X86FeatureTOPOLOGY:     "topoext",
+	X86FeaturePERFCTR_CORE: "perfctr_core",
+	X86FeaturePERFCTR_NB:   "perfctr_nb",
+	X86FeatureBPEXT:        "bpext",
+	X86FeaturePERFCTR_TSC:  "ptsc",
+	X86FeaturePERFCTR_LLC:  "perfctr_llc",
+	X86FeatureMWAITX:       "mwaitx",
+
+	// Block 6.
+	X86FeatureSYSCALL:  "syscall",
+	X86FeatureNX:       "nx",
+	X86FeatureMMXEXT:   "mmxext",
+	X86FeatureFXSR_OPT: "fxsr_opt",
+	X86FeatureGBPAGES:  "pdpe1gb",
+	X86FeatureRDTSCP:   "rdtscp",
+	X86FeatureLM:       "lm",
+	X86Feature3DNOWEXT: "3dnowext",
+	X86Feature3DNOW:    "3dnow",
+}
+
+// These flags are parse only---they can be used for setting / unsetting the
+// flags, but will not get printed out in /proc/cpuinfo.
+var x86FeatureParseOnlyStrings = map[Feature]string{
+	// Block 0.
+	X86FeatureOSXSAVE: "osxsave",
+
+	// Block 2.
+	X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only",
+	X86FeatureFPCSDS:          "fpcsds",
+	X86FeatureIPT:             "pt",
+	X86FeatureCLFLUSHOPT:      "clfushopt",
+
+	// Block 3.
+	X86FeaturePREFETCHWT1: "prefetchwt1",
+}
+
+// intelCacheDescriptors describe the caches and TLBs on the system. They are
+// returned in the registers for eax=2. Intel only.
+type intelCacheDescriptor uint8
+
+// Valid cache/TLB descriptors. All descriptors can be found in Intel SDM Vol.
+// 2, Ch. 3.2, "CPUID", Table 3-12 "Encoding of CPUID Leaf 2 Descriptors".
+const (
+	intelNullDescriptor    intelCacheDescriptor = 0
+	intelNoTLBDescriptor   intelCacheDescriptor = 0xfe
+	intelNoCacheDescriptor intelCacheDescriptor = 0xff
+
+	// Most descriptors omitted for brevity as they are currently unused.
+)
+
+// CacheType describes the type of a cache, as returned in eax[4:0] for eax=4.
+type CacheType uint8
+
+const (
+	// cacheNull indicates that there are no more entries.
+	cacheNull CacheType = iota
+
+	// CacheData is a data cache.
+	CacheData
+
+	// CacheInstruction is an instruction cache.
+	CacheInstruction
+
+	// CacheUnified is a unified instruction and data cache.
+	CacheUnified
+)
+
+// Cache describes the parameters of a single cache on the system.
+//
+// +stateify savable
+type Cache struct {
+	// Level is the hierarchical level of this cache (L1, L2, etc).
+	Level uint32
+
+	// Type is the type of cache.
+	Type CacheType
+
+	// FullyAssociative indicates that entries may be placed in any block.
+	FullyAssociative bool
+
+	// Partitions is the number of physical partitions in the cache.
+	Partitions uint32
+
+	// Ways is the number of ways of associativity in the cache.
+	Ways uint32
+
+	// Sets is the number of sets in the cache.
+	Sets uint32
+
+	// InvalidateHierarchical indicates that WBINVD/INVD from threads
+	// sharing this cache acts upon lower level caches for threads sharing
+	// this cache.
+	InvalidateHierarchical bool
+
+	// Inclusive indicates that this cache is inclusive of lower cache
+	// levels.
+	Inclusive bool
+
+	// DirectMapped indicates that this cache is directly mapped from
+	// address, rather than using a hash function.
+	DirectMapped bool
+}
+
+// Just a way to wrap cpuid function numbers.
+type cpuidFunction uint32
+
+// The constants below are the lower or "standard" cpuid functions, ordered as
+// defined by the hardware.
+const (
+	vendorID                      cpuidFunction = iota // Returns vendor ID and largest standard function.
+	featureInfo                                        // Returns basic feature bits and processor signature.
+	intelCacheDescriptors                              // Returns list of cache descriptors. Intel only.
+	intelSerialNumber                                  // Returns processor serial number (obsolete on new hardware). Intel only.
+	intelDeterministicCacheParams                      // Returns deterministic cache information. Intel only.
+	monitorMwaitParams                                 // Returns information about monitor/mwait instructions.
+	powerParams                                        // Returns information about power management and thermal sensors.
+	extendedFeatureInfo                                // Returns extended feature bits.
+	_                                                  // Function 0x8 is reserved.
+	intelDCAParams                                     // Returns direct cache access information. Intel only.
+	intelPMCInfo                                       // Returns information about performance monitoring features. Intel only.
+	intelX2APICInfo                                    // Returns core/logical processor topology. Intel only.
+	_                                                  // Function 0xc is reserved.
+	xSaveInfo                                          // Returns information about extended state management.
+)
+
+// The "extended" functions start at 0x80000000.
+const (
+	extendedFunctionInfo cpuidFunction = 0x80000000 + iota // Returns highest available extended function in eax.
+	extendedFeatures                                       // Returns some extended feature bits in edx and ecx.
+)
+
+// These are the extended floating point state features. They are used to
+// enumerate floating point features in XCR0, XSTATE_BV, etc.
+const (
+	XSAVEFeatureX87         = 1 << 0
+	XSAVEFeatureSSE         = 1 << 1
+	XSAVEFeatureAVX         = 1 << 2
+	XSAVEFeatureBNDREGS     = 1 << 3
+	XSAVEFeatureBNDCSR      = 1 << 4
+	XSAVEFeatureAVX512op    = 1 << 5
+	XSAVEFeatureAVX512zmm0  = 1 << 6
+	XSAVEFeatureAVX512zmm16 = 1 << 7
+	XSAVEFeaturePKRU        = 1 << 9
+)
+
+var cpuFreqMHz float64
+
+// x86FeaturesFromString includes features from x86FeatureStrings and
+// x86FeatureParseOnlyStrings.
+var x86FeaturesFromString = make(map[string]Feature)
+
+// FeatureFromString returns the Feature associated with the given feature
+// string plus a bool to indicate if it could find the feature.
+func FeatureFromString(s string) (Feature, bool) {
+	f, b := x86FeaturesFromString[s]
+	return f, b
+}
+
+// String implements fmt.Stringer.
+func (f Feature) String() string {
+	if s := f.flagString(false); s != "" {
+		return s
+	}
+
+	block := int(f) / 32
+	bit := int(f) % 32
+	return fmt.Sprintf("<cpuflag %d; block %d bit %d>", f, block, bit)
+}
+
+func (f Feature) flagString(cpuinfoOnly bool) string {
+	if s, ok := x86FeatureStrings[f]; ok {
+		return s
+	}
+	if !cpuinfoOnly {
+		return x86FeatureParseOnlyStrings[f]
+	}
+	return ""
+}
+
+// FeatureSet is a set of Features for a CPU.
+//
+// +stateify savable
+type FeatureSet struct {
+	// Set is the set of features that are enabled in this FeatureSet.
+	Set map[Feature]bool
+
+	// VendorID is the 12-char string returned in ebx:edx:ecx for eax=0.
+	VendorID string
+
+	// ExtendedFamily is part of the processor signature.
+	ExtendedFamily uint8
+
+	// ExtendedModel is part of the processor signature.
+	ExtendedModel uint8
+
+	// ProcessorType is part of the processor signature.
+	ProcessorType uint8
+
+	// Family is part of the processor signature.
+	Family uint8
+
+	// Model is part of the processor signature.
+	Model uint8
+
+	// SteppingID is part of the processor signature.
+	SteppingID uint8
+
+	// Caches describes the caches on the CPU.
+	Caches []Cache
+
+	// CacheLine is the size of a cache line in bytes.
+	//
+	// All caches use the same line size. This is not enforced in the CPUID
+	// encoding, but is true on all known x86 processors.
+	CacheLine uint32
+}
+
+// FlagsString prints out supported CPU flags. If cpuinfoOnly is true, it is
+// equivalent to the "flags" field in /proc/cpuinfo.
+func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string {
+	var s []string
+	for _, b := range linuxBlockOrder {
+		for i := 0; i < blockSize; i++ {
+			if f := featureID(b, i); fs.Set[f] {
+				if fstr := f.flagString(cpuinfoOnly); fstr != "" {
+					s = append(s, fstr)
+				}
+			}
+		}
+	}
+	return strings.Join(s, " ")
+}
+
+// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is
+// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are
+// not always printed in Linux. The bogomips field is simply made up.
+func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
+	fmt.Fprintf(b, "processor\t: %d\n", cpu)
+	fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID)
+	fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family)
+	fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model)
+	fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now.
+	fmt.Fprintf(b, "stepping\t: %s\n", "unknown")   // Unknown for now.
+	fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz)
+	fmt.Fprintln(b, "fpu\t\t: yes")
+	fmt.Fprintln(b, "fpu_exception\t: yes")
+	fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID.
+	fmt.Fprintln(b, "wp\t\t: yes")
+	fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true))
+	fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
+	fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine)
+	fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine)
+	fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48)
+	fmt.Fprintln(b, "power management:") // This is always here, but can be blank.
+	fmt.Fprintln(b, "")                  // The /proc/cpuinfo file ends with an extra newline.
+}
+
+const (
+	amdVendorID   = "AuthenticAMD"
+	intelVendorID = "GenuineIntel"
+)
+
+// AMD returns true if fs describes an AMD CPU.
+func (fs *FeatureSet) AMD() bool {
+	return fs.VendorID == amdVendorID
+}
+
+// Intel returns true if fs describes an Intel CPU.
+func (fs *FeatureSet) Intel() bool {
+	return fs.VendorID == intelVendorID
+}
+
+// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a
+// subset of the host feature set.
+type ErrIncompatible struct {
+	message string
+}
+
+// Error implements error.
+func (e ErrIncompatible) Error() string {
+	return e.message
+}
+
+// CheckHostCompatible returns nil if fs is a subset of the host feature set.
+func (fs *FeatureSet) CheckHostCompatible() error {
+	hfs := HostFeatureSet()
+
+	if diff := fs.Subtract(hfs); diff != nil {
+		return ErrIncompatible{fmt.Sprintf("CPU feature set %v incompatible with host feature set %v (missing: %v)", fs.FlagsString(false), hfs.FlagsString(false), diff)}
+	}
+
+	// The size of a cache line must match, as it is critical to correctly
+	// utilizing CLFLUSH. Other cache properties are allowed to change, as
+	// they are not important to correctness.
+	if fs.CacheLine != hfs.CacheLine {
+		return ErrIncompatible{fmt.Sprintf("CPU cache line size %d incompatible with host cache line size %d", fs.CacheLine, hfs.CacheLine)}
+	}
+
+	return nil
+}
+
+// Helper to convert 3 regs into 12-byte vendor ID.
+func vendorIDFromRegs(bx, cx, dx uint32) string {
+	bytes := make([]byte, 0, 12)
+	for i := uint(0); i < 4; i++ {
+		b := byte(bx >> (i * 8))
+		bytes = append(bytes, b)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		b := byte(dx >> (i * 8))
+		bytes = append(bytes, b)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		b := byte(cx >> (i * 8))
+		bytes = append(bytes, b)
+	}
+	return string(bytes)
+}
+
+// ExtendedStateSize returns the number of bytes needed to save the "extended
+// state" for this processor and the boundary it must be aligned to. Extended
+// state includes floating point registers, and other cpu state that's not
+// associated with the normal task context.
+//
+// Note: We can save some space here with an optimization where we use a
+// smaller chunk of memory depending on features that are actually enabled.
+// Currently we just use the largest possible size for simplicity (which is
+// about 2.5K worst case, with avx512).
+func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
+	if fs.UseXsave() {
+		// Leaf 0 of xsaveinfo function returns the size for currently
+		// enabled xsave features in ebx, the maximum size if all valid
+		// features are saved with xsave in ecx, and valid XCR0 bits in
+		// edx:eax.
+		_, _, maxSize, _ := HostID(uint32(xSaveInfo), 0)
+		return uint(maxSize), 64
+	}
+
+	// If we don't support xsave, we fall back to fxsave, which requires
+	// 512 bytes aligned to 16 bytes.
+	return 512, 16
+}
+
+// ValidXCR0Mask returns the bits that may be set to 1 in control register
+// XCR0.
+func (fs *FeatureSet) ValidXCR0Mask() uint64 {
+	if !fs.UseXsave() {
+		return 0
+	}
+	eax, _, _, edx := HostID(uint32(xSaveInfo), 0)
+	return uint64(edx)<<32 | uint64(eax)
+}
+
+// vendorIDRegs returns the 3 register values used to construct the 12-byte
+// vendor ID string for eax=0.
+func (fs *FeatureSet) vendorIDRegs() (bx, dx, cx uint32) {
+	for i := uint(0); i < 4; i++ {
+		bx |= uint32(fs.VendorID[i]) << (i * 8)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		dx |= uint32(fs.VendorID[i+4]) << (i * 8)
+	}
+
+	for i := uint(0); i < 4; i++ {
+		cx |= uint32(fs.VendorID[i+8]) << (i * 8)
+	}
+	return
+}
+
+// signature returns the signature dword that's returned in eax when eax=1.
+func (fs *FeatureSet) signature() uint32 {
+	var s uint32
+	s |= uint32(fs.SteppingID & 0xf)
+	s |= uint32(fs.Model&0xf) << 4
+	s |= uint32(fs.Family&0xf) << 8
+	s |= uint32(fs.ProcessorType&0x3) << 12
+	s |= uint32(fs.ExtendedModel&0xf) << 16
+	s |= uint32(fs.ExtendedFamily&0xff) << 20
+	return s
+}
+
+// Helper to deconstruct signature dword.
+func signatureSplit(v uint32) (ef, em, pt, f, m, sid uint8) {
+	sid = uint8(v & 0xf)
+	m = uint8(v>>4) & 0xf
+	f = uint8(v>>8) & 0xf
+	pt = uint8(v>>12) & 0x3
+	em = uint8(v>>16) & 0xf
+	ef = uint8(v >> 20)
+	return
+}
+
+// Helper to convert blockwise feature bit masks into a set of features. Masks
+// must be provided in order for each block, without skipping them. If a block
+// does not matter for this feature set, 0 is specified.
+func setFromBlockMasks(blocks ...uint32) map[Feature]bool {
+	s := make(map[Feature]bool)
+	for b, blockMask := range blocks {
+		for i := 0; i < blockSize; i++ {
+			if blockMask&1 != 0 {
+				s[featureID(block(b), i)] = true
+			}
+			blockMask >>= 1
+		}
+	}
+	return s
+}
+
+// blockMask returns the 32-bit mask associated with a block of features.
+func (fs *FeatureSet) blockMask(b block) uint32 {
+	var mask uint32
+	for i := 0; i < blockSize; i++ {
+		if fs.Set[featureID(b, i)] {
+			mask |= 1 << uint(i)
+		}
+	}
+	return mask
+}
+
+// Remove removes a Feature from a FeatureSet. It ignores features
+// that are not in the FeatureSet.
+func (fs *FeatureSet) Remove(feature Feature) {
+	delete(fs.Set, feature)
+}
+
+// Add adds a Feature to a FeatureSet. It ignores duplicate features.
+func (fs *FeatureSet) Add(feature Feature) {
+	fs.Set[feature] = true
+}
+
+// HasFeature tests whether or not a feature is in the given feature set.
+func (fs *FeatureSet) HasFeature(feature Feature) bool {
+	return fs.Set[feature]
+}
+
+// Subtract returns the features present in fs that are not present in other.
+// If all features in fs are present in other, Subtract returns nil.
+func (fs *FeatureSet) Subtract(other *FeatureSet) (diff map[Feature]bool) {
+	for f := range fs.Set {
+		if !other.Set[f] {
+			if diff == nil {
+				diff = make(map[Feature]bool)
+			}
+			diff[f] = true
+		}
+	}
+
+	return
+}
+
+// EmulateID emulates a cpuid instruction based on the feature set.
+func (fs *FeatureSet) EmulateID(origAx, origCx uint32) (ax, bx, cx, dx uint32) {
+	switch cpuidFunction(origAx) {
+	case vendorID:
+		ax = uint32(xSaveInfo) // 0xd (xSaveInfo) is the highest function we support.
+		bx, dx, cx = fs.vendorIDRegs()
+	case featureInfo:
+		// CLFLUSH line size is encoded in quadwords. Other fields in bx unsupported.
+		bx = (fs.CacheLine / 8) << 8
+		cx = fs.blockMask(block(0))
+		dx = fs.blockMask(block(1))
+		ax = fs.signature()
+	case intelCacheDescriptors:
+		if !fs.Intel() {
+			// Reserved on non-Intel.
+			return 0, 0, 0, 0
+		}
+
+		// "The least-significant byte in register EAX (register AL)
+		// will always return 01H. Software should ignore this value
+		// and not interpret it as an informational descriptor." - SDM
+		//
+		// We only support reporting cache parameters via
+		// intelDeterministicCacheParams; report as much here.
+		//
+		// We do not support exposing TLB information at all.
+		ax = 1 | (uint32(intelNoCacheDescriptor) << 8)
+	case intelDeterministicCacheParams:
+		if !fs.Intel() {
+			// Reserved on non-Intel.
+			return 0, 0, 0, 0
+		}
+
+		// cx is the index of the cache to describe.
+		if int(origCx) >= len(fs.Caches) {
+			return uint32(cacheNull), 0, 0, 0
+		}
+		c := fs.Caches[origCx]
+
+		ax = uint32(c.Type)
+		ax |= c.Level << 5
+		ax |= 1 << 8 // Always claim the cache is "self-initializing".
+		if c.FullyAssociative {
+			ax |= 1 << 9
+		}
+		// Processor topology not supported.
+
+		bx = fs.CacheLine - 1
+		bx |= (c.Partitions - 1) << 12
+		bx |= (c.Ways - 1) << 22
+
+		cx = c.Sets - 1
+
+		if !c.InvalidateHierarchical {
+			dx |= 1
+		}
+		if c.Inclusive {
+			dx |= 1 << 1
+		}
+		if !c.DirectMapped {
+			dx |= 1 << 2
+		}
+	case xSaveInfo:
+		if !fs.UseXsave() {
+			return 0, 0, 0, 0
+		}
+		return HostID(uint32(xSaveInfo), origCx)
+	case extendedFeatureInfo:
+		if origCx != 0 {
+			break // Only leaf 0 is supported.
+		}
+		bx = fs.blockMask(block(2))
+		cx = fs.blockMask(block(3))
+	case extendedFunctionInfo:
+		// We only support showing the extended features.
+		ax = uint32(extendedFeatures)
+		cx = 0
+	case extendedFeatures:
+		cx = fs.blockMask(block(5))
+		dx = fs.blockMask(block(6))
+		if fs.AMD() {
+			// AMD duplicates some block 1 features in block 6.
+			dx |= fs.blockMask(block(1)) & block6DuplicateMask
+		}
+	}
+
+	return
+}
+
+// UseXsave returns the choice of fp state saving instruction.
+func (fs *FeatureSet) UseXsave() bool {
+	return fs.HasFeature(X86FeatureXSAVE) && fs.HasFeature(X86FeatureOSXSAVE)
+}
+
+// UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction.
+func (fs *FeatureSet) UseXsaveopt() bool {
+	return fs.UseXsave() && fs.HasFeature(X86FeatureXSAVEOPT)
+}
+
+// HostID executes a native CPUID instruction.
+func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32)
+
+// HostFeatureSet uses cpuid to get host values and construct a feature set
+// that matches that of the host machine. Note that there are several places
+// where there appear to be some unnecessary assignments between register names
+// (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show
+// where the different feature blocks come from, to make the code easier to
+// inspect and read.
+func HostFeatureSet() *FeatureSet {
+	// eax=0 gets max supported feature and vendor ID.
+	_, bx, cx, dx := HostID(0, 0)
+	vendorID := vendorIDFromRegs(bx, cx, dx)
+
+	// eax=1 gets basic features in ecx:edx.
+	ax, bx, cx, dx := HostID(1, 0)
+	featureBlock0 := cx
+	featureBlock1 := dx
+	ef, em, pt, f, m, sid := signatureSplit(ax)
+	cacheLine := 8 * (bx >> 8) & 0xff
+
+	// eax=4, ecx=i gets details about cache index i. Only supported on Intel.
+	var caches []Cache
+	if vendorID == intelVendorID {
+		// ecx selects the cache index until a null type is returned.
+		for i := uint32(0); ; i++ {
+			ax, bx, cx, dx := HostID(4, i)
+			t := CacheType(ax & 0xf)
+			if t == cacheNull {
+				break
+			}
+
+			lineSize := (bx & 0xfff) + 1
+			if lineSize != cacheLine {
+				panic(fmt.Sprintf("Mismatched cache line size: %d vs %d", lineSize, cacheLine))
+			}
+
+			caches = append(caches, Cache{
+				Type:                   t,
+				Level:                  (ax >> 5) & 0x7,
+				FullyAssociative:       ((ax >> 9) & 1) == 1,
+				Partitions:             ((bx >> 12) & 0x3ff) + 1,
+				Ways:                   ((bx >> 22) & 0x3ff) + 1,
+				Sets:                   cx + 1,
+				InvalidateHierarchical: (dx & 1) == 0,
+				Inclusive:              ((dx >> 1) & 1) == 1,
+				DirectMapped:           ((dx >> 2) & 1) == 0,
+			})
+		}
+	}
+
+	// eax=7, ecx=0 gets extended features in ecx:ebx.
+	_, bx, cx, _ = HostID(7, 0)
+	featureBlock2 := bx
+	featureBlock3 := cx
+
+	// Leaf 0xd is supported only if CPUID.1:ECX.XSAVE[bit 26] is set.
+	var featureBlock4 uint32
+	if (featureBlock0 & (1 << 26)) != 0 {
+		featureBlock4, _, _, _ = HostID(uint32(xSaveInfo), 1)
+	}
+
+	// eax=0x80000000 gets supported extended levels. We use this to
+	// determine if there are any non-zero block 4 or block 6 bits to find.
+	var featureBlock5, featureBlock6 uint32
+	if ax, _, _, _ := HostID(uint32(extendedFunctionInfo), 0); ax >= uint32(extendedFeatures) {
+		// eax=0x80000001 gets AMD added feature bits.
+		_, _, cx, dx = HostID(uint32(extendedFeatures), 0)
+		featureBlock5 = cx
+		// Ignore features duplicated from block 1 on AMD. These bits
+		// are reserved on Intel.
+		featureBlock6 = dx &^ block6DuplicateMask
+	}
+
+	set := setFromBlockMasks(featureBlock0, featureBlock1, featureBlock2, featureBlock3, featureBlock4, featureBlock5, featureBlock6)
+	return &FeatureSet{
+		Set:            set,
+		VendorID:       vendorID,
+		ExtendedFamily: ef,
+		ExtendedModel:  em,
+		ProcessorType:  pt,
+		Family:         f,
+		Model:          m,
+		SteppingID:     sid,
+		CacheLine:      cacheLine,
+		Caches:         caches,
+	}
+}
+
+// Reads max cpu frequency from host /proc/cpuinfo. Must run before
+// whitelisting. This value is used to create the fake /proc/cpuinfo from a
+// FeatureSet.
+func initCPUFreq() {
+	cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo")
+	if err != nil {
+		// Leave it as 0... The standalone VDSO bails out in the same
+		// way.
+		log.Warningf("Could not read /proc/cpuinfo: %v", err)
+		return
+	}
+	cpuinfo := string(cpuinfob)
+
+	// We get the value straight from host /proc/cpuinfo. On machines with
+	// frequency scaling enabled, this will only get the current value
+	// which will likely be inaccurate. This is fine on machines with
+	// frequency scaling disabled.
+	for _, line := range strings.Split(cpuinfo, "\n") {
+		if strings.Contains(line, "cpu MHz") {
+			splitMHz := strings.Split(line, ":")
+			if len(splitMHz) < 2 {
+				log.Warningf("Could not read /proc/cpuinfo: malformed cpu MHz line")
+				return
+			}
+
+			// If there was a problem, leave cpuFreqMHz as 0.
+			var err error
+			cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64)
+			if err != nil {
+				log.Warningf("Could not parse cpu MHz value %v: %v", splitMHz[1], err)
+				cpuFreqMHz = 0
+				return
+			}
+			return
+		}
+	}
+	log.Warningf("Could not parse /proc/cpuinfo, it is empty or does not contain cpu MHz")
+}
+
+func initFeaturesFromString() {
+	for f, s := range x86FeatureStrings {
+		x86FeaturesFromString[s] = f
+	}
+	for f, s := range x86FeatureParseOnlyStrings {
+		x86FeaturesFromString[s] = f
+	}
+}
+
+func init() {
+	// initCpuFreq must be run before whitelists are enabled.
+	initCPUFreq()
+	initFeaturesFromString()
+}
diff --git a/pkg/cpuid/cpuid_x86_test.go b/pkg/cpuid/cpuid_x86_test.go
new file mode 100644
index 000000000..0fe20c213
--- /dev/null
+++ b/pkg/cpuid/cpuid_x86_test.go
@@ -0,0 +1,243 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package cpuid
+
+import (
+	"testing"
+)
+
+// These are the default values of various FeatureSet fields.
+const (
+	defaultVendorID = "GenuineIntel"
+
+	// These processor signature defaults are derived from the values
+	// listed in Intel Application Note 485 for i7/Xeon processors.
+	defaultExtFamily  uint8 = 0
+	defaultExtModel   uint8 = 1
+	defaultType       uint8 = 0
+	defaultFamily     uint8 = 0x06
+	defaultModel      uint8 = 0x0a
+	defaultSteppingID uint8 = 0
+)
+
+// newEmptyFeatureSet creates a new FeatureSet with a sensible default model and no features.
+func newEmptyFeatureSet() *FeatureSet {
+	return &FeatureSet{
+		Set:            make(map[Feature]bool),
+		VendorID:       defaultVendorID,
+		ExtendedFamily: defaultExtFamily,
+		ExtendedModel:  defaultExtModel,
+		ProcessorType:  defaultType,
+		Family:         defaultFamily,
+		Model:          defaultModel,
+		SteppingID:     defaultSteppingID,
+	}
+}
+
+var justFPU = &FeatureSet{
+	Set: map[Feature]bool{
+		X86FeatureFPU: true,
+	}}
+
+var justFPUandPAE = &FeatureSet{
+	Set: map[Feature]bool{
+		X86FeatureFPU: true,
+		X86FeaturePAE: true,
+	}}
+
+func TestSubtract(t *testing.T) {
+	if diff := justFPU.Subtract(justFPUandPAE); diff != nil {
+		t.Errorf("Got %v is not subset of %v, want diff (%v) to be nil", justFPU, justFPUandPAE, diff)
+	}
+
+	if justFPUandPAE.Subtract(justFPU) == nil {
+		t.Errorf("Got %v is a subset of %v, want diff to be nil", justFPU, justFPUandPAE)
+	}
+}
+
+// TODO(b/73346484): Run this test on a very old platform, and make sure more
+// bits are enabled than just FPU and PAE. This test currently may not detect
+// if HostFeatureSet gives back junk bits.
+func TestHostFeatureSet(t *testing.T) {
+	hostFeatures := HostFeatureSet()
+	if justFPUandPAE.Subtract(hostFeatures) != nil {
+		t.Errorf("Got invalid feature set %v from HostFeatureSet()", hostFeatures)
+	}
+}
+
+func TestHasFeature(t *testing.T) {
+	if !justFPU.HasFeature(X86FeatureFPU) {
+		t.Errorf("HasFeature failed, %v should contain %v", justFPU, X86FeatureFPU)
+	}
+
+	if justFPU.HasFeature(X86FeatureAVX) {
+		t.Errorf("HasFeature failed, %v should not contain %v", justFPU, X86FeatureAVX)
+	}
+}
+
+// Note: these tests are aware of and abuse internal details of FeatureSets.
+// Users of FeatureSets should not depend on this.
+func TestAdd(t *testing.T) {
+	// Test a basic insertion into the FeatureSet.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureCLFSH)
+	if len(testFeatures.Set) != 1 {
+		t.Errorf("Got length %v want 1", len(testFeatures.Set))
+	}
+
+	if !testFeatures.HasFeature(X86FeatureCLFSH) {
+		t.Errorf("Add failed, got %v want set with %v", testFeatures, X86FeatureCLFSH)
+	}
+
+	// Test that duplicates are ignored.
+	testFeatures.Add(X86FeatureCLFSH)
+	if len(testFeatures.Set) != 1 {
+		t.Errorf("Got length %v, want 1", len(testFeatures.Set))
+	}
+}
+
+func TestRemove(t *testing.T) {
+	// Try removing the last feature.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureFPU)
+	testFeatures.Add(X86FeaturePAE)
+	testFeatures.Remove(X86FeaturePAE)
+	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 || testFeatures.HasFeature(X86FeaturePAE) {
+		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
+	}
+
+	// Try removing a feature not in the set.
+	testFeatures.Remove(X86FeatureRDRAND)
+	if !testFeatures.HasFeature(X86FeatureFPU) || len(testFeatures.Set) != 1 {
+		t.Errorf("Remove failed, got %v want %v", testFeatures, justFPU)
+	}
+}
+
+func TestFeatureFromString(t *testing.T) {
+	f, ok := FeatureFromString("avx")
+	if f != X86FeatureAVX || !ok {
+		t.Errorf("got %v want avx", f)
+	}
+
+	f, ok = FeatureFromString("bad")
+	if ok {
+		t.Errorf("got %v want nothing", f)
+	}
+}
+
+// This tests function 0 (eax=0), which returns the vendor ID and highest cpuid
+// function reported to be available.
+func TestEmulateIDVendorAndLength(t *testing.T) {
+	testFeatures := newEmptyFeatureSet()
+
+	ax, bx, cx, dx := testFeatures.EmulateID(0, 0)
+	wantEax := uint32(0xd) // Highest supported cpuid function.
+
+	// These magical constants are the characters of "GenuineIntel".
+	// See Intel AN485 for a reference on why they are laid out like this.
+	wantEbx := uint32(0x756e6547)
+	wantEcx := uint32(0x6c65746e)
+	wantEdx := uint32(0x49656e69)
+	if wantEax != ax {
+		t.Errorf("highest function failed, got %x want %x", ax, wantEax)
+	}
+
+	if wantEbx != bx || wantEcx != cx || wantEdx != dx {
+		t.Errorf("vendor string emulation failed, bx:cx:dx, got %x:%x:%x want %x:%x:%x", bx, cx, dx, wantEbx, wantEcx, wantEdx)
+	}
+}
+
+func TestEmulateIDBasicFeatures(t *testing.T) {
+	// Make a minimal test feature set.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureCLFSH)
+	testFeatures.Add(X86FeatureAVX)
+	testFeatures.CacheLine = 64
+
+	ax, bx, cx, dx := testFeatures.EmulateID(1, 0)
+	ECXAVXBit := uint32(1 << uint(X86FeatureAVX))
+	EDXCLFlushBit := uint32(1 << uint(X86FeatureCLFSH-32)) // We adjust by 32 since it's in block 1.
+
+	if EDXCLFlushBit&dx == 0 || dx&^EDXCLFlushBit != 0 {
+		t.Errorf("EmulateID failed, got feature bits %x want %x", dx, testFeatures.blockMask(1))
+	}
+
+	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
+		t.Errorf("EmulateID failed, got feature bits %x want %x", cx, testFeatures.blockMask(0))
+	}
+
+	// Default signature bits, based on values for i7/Xeon.
+	// See Intel AN485 for information on stepping/model bits.
+	defaultSignature := uint32(0x000106a0)
+	if defaultSignature != ax {
+		t.Errorf("EmulateID stepping emulation failed, got %x want %x", ax, defaultSignature)
+	}
+
+	clflushSizeInfo := uint32(8 << 8)
+	if clflushSizeInfo != bx {
+		t.Errorf("EmulateID bx emulation failed, got %x want %x", bx, clflushSizeInfo)
+	}
+}
+
+func TestEmulateIDExtendedFeatures(t *testing.T) {
+	// Make a minimal test feature set, one bit in each extended feature word.
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureSMEP)
+	testFeatures.Add(X86FeatureAVX512VBMI)
+
+	ax, bx, cx, dx := testFeatures.EmulateID(7, 0)
+	EBXSMEPBit := uint32(1 << uint(X86FeatureSMEP-2*32))      // Adjust by 2*32 since SMEP is a block 2 feature.
+	ECXAVXBit := uint32(1 << uint(X86FeatureAVX512VBMI-3*32)) // We adjust by 3*32 since it's a block 3 feature.
+
+	// Test that the desired bit is set and no other bits are set.
+	if EBXSMEPBit&bx == 0 || bx&^EBXSMEPBit != 0 {
+		t.Errorf("extended feature emulation failed, got feature bits %x want %x", bx, testFeatures.blockMask(2))
+	}
+
+	if ECXAVXBit&cx == 0 || cx&^ECXAVXBit != 0 {
+		t.Errorf("extended feature emulation failed, got feature bits %x want %x", cx, testFeatures.blockMask(3))
+	}
+
+	if ax != 0 || dx != 0 {
+		t.Errorf("extended feature emulation failed, ax:dx, got %x:%x want 0:0", ax, dx)
+	}
+
+	// Check that no subleaves other than 0 do anything.
+	ax, bx, cx, dx = testFeatures.EmulateID(7, 1)
+	if ax != 0 || bx != 0 || cx != 0 || dx != 0 {
+		t.Errorf("extended feature emulation failed, got %x:%x:%x:%x want 0:0", ax, bx, cx, dx)
+	}
+
+}
+
+// Checks that the expected extended features are available via cpuid functions
+// 0x80000000 and up.
+func TestEmulateIDExtended(t *testing.T) {
+	testFeatures := newEmptyFeatureSet()
+	testFeatures.Add(X86FeatureSYSCALL)
+	EDXSYSCALLBit := uint32(1 << uint(X86FeatureSYSCALL-6*32)) // Adjust by 6*32 since SYSCALL is a block 6 feature.
+
+	ax, bx, cx, dx := testFeatures.EmulateID(0x80000000, 0)
+	if ax != 0x80000001 || bx != 0 || cx != 0 || dx != 0 {
+		t.Errorf("EmulateID extended emulation failed, ax:bx:cx:dx, got %x:%x:%x:%x want 0x80000001:0:0:0", ax, bx, cx, dx)
+	}
+
+	_, _, _, dx = testFeatures.EmulateID(0x80000001, 0)
+	if EDXSYSCALLBit&dx == 0 || dx&^EDXSYSCALLBit != 0 {
+		t.Errorf("extended feature emulation failed, got feature bits %x want %x", dx, testFeatures.blockMask(6))
+	}
+}
-- 
cgit v1.2.3


From 0efa8168c7c04ec0a4bd62e2d2eb8718b5d72ea7 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 11:28:57 -0800
Subject: Update visibility.

PiperOrigin-RevId: 294265019
---
 pkg/seccomp/BUILD                | 2 +-
 test/root/testdata/BUILD         | 2 +-
 tools/build/BUILD                | 2 +-
 tools/checkunsafe/BUILD          | 2 +-
 tools/go_generics/BUILD          | 2 +-
 tools/go_generics/go_merge/BUILD | 2 +-
 tools/go_stateify/BUILD          | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'pkg')

diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index 742c8b79b..c5fca2ba3 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -26,7 +26,7 @@ go_library(
         "seccomp_rules.go",
         "seccomp_unsafe.go",
     ],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/bpf",
diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD
index bca5f9cab..6859541ad 100644
--- a/test/root/testdata/BUILD
+++ b/test/root/testdata/BUILD
@@ -13,6 +13,6 @@ go_library(
         "simple.go",
     ],
     visibility = [
-        "//visibility:public",
+        "//:sandbox",
     ],
 )
diff --git a/tools/build/BUILD b/tools/build/BUILD
index 0c0ce3f4d..00a467473 100644
--- a/tools/build/BUILD
+++ b/tools/build/BUILD
@@ -6,5 +6,5 @@ genrule(
     name = "loopback",
     outs = ["loopback.txt"],
     cmd = "touch $@",
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
 )
diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD
index 92ba8ab06..4f1a31a6d 100644
--- a/tools/checkunsafe/BUILD
+++ b/tools/checkunsafe/BUILD
@@ -5,7 +5,7 @@ package(licenses = ["notice"])
 go_tool_library(
     name = "checkunsafe",
     srcs = ["check_unsafe.go"],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = [
         "@org_golang_x_tools//go/analysis:go_tool_library",
     ],
diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD
index 069df3856..32a949c93 100644
--- a/tools/go_generics/BUILD
+++ b/tools/go_generics/BUILD
@@ -9,7 +9,7 @@ go_binary(
         "imports.go",
         "remove.go",
     ],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = ["//tools/go_generics/globals"],
 )
 
diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD
index b7d35e272..2fd5a200d 100644
--- a/tools/go_generics/go_merge/BUILD
+++ b/tools/go_generics/go_merge/BUILD
@@ -5,5 +5,5 @@ package(licenses = ["notice"])
 go_binary(
     name = "go_merge",
     srcs = ["main.go"],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
 )
diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD
index 6036faf7b..503cdf2e5 100644
--- a/tools/go_stateify/BUILD
+++ b/tools/go_stateify/BUILD
@@ -5,6 +5,6 @@ package(licenses = ["notice"])
 go_binary(
     name = "stateify",
     srcs = ["main.go"],
-    visibility = ["//visibility:public"],
+    visibility = ["//:sandbox"],
     deps = ["//tools/tags"],
 )
-- 
cgit v1.2.3


From 20840bfec087d45853e81d1ac34940f3b2fb920a Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Mon, 10 Feb 2020 11:57:31 -0800
Subject: Move x86 state definition to its own file.

PiperOrigin-RevId: 294271541
---
 pkg/sentry/arch/BUILD             |  1 +
 pkg/sentry/arch/arch_state_x86.go |  4 ++--
 pkg/sentry/arch/arch_x86.go       | 15 --------------
 pkg/sentry/arch/arch_x86_impl.go  | 43 +++++++++++++++++++++++++++++++++++++++
 tools/build/tags.bzl              | 24 +++++++++++-----------
 5 files changed, 58 insertions(+), 29 deletions(-)
 create mode 100644 pkg/sentry/arch/arch_x86_impl.go

(limited to 'pkg')

diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 34c0a867d..e27f21e5e 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -14,6 +14,7 @@ go_library(
         "arch_state_aarch64.go",
         "arch_state_x86.go",
         "arch_x86.go",
+        "arch_x86_impl.go",
         "auxv.go",
         "signal.go",
         "signal_act.go",
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index d388ee9cf..e35c9214a 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -43,8 +43,8 @@ func (e ErrFloatingPoint) Error() string {
 // and SSE state, so this is the equivalent XSTATE_BV value.
 const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE
 
-// afterLoad is invoked by stateify.
-func (s *State) afterLoad() {
+// afterLoadFPState is invoked by afterLoad.
+func (s *State) afterLoadFPState() {
 	old := s.x86FPState
 
 	// Recreate the slice. This is done to ensure that it is aligned
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 3db8bd34b..88b40a9d1 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -155,21 +155,6 @@ func NewFloatingPointData() *FloatingPointData {
 	return (*FloatingPointData)(&(newX86FPState()[0]))
 }
 
-// State contains the common architecture bits for X86 (the build tag of this
-// file ensures it's only built on x86).
-//
-// +stateify savable
-type State struct {
-	// The system registers.
-	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
-
-	// Our floating point state.
-	x86FPState `state:"wait"`
-
-	// FeatureSet is a pointer to the currently active feature set.
-	FeatureSet *cpuid.FeatureSet
-}
-
 // Proto returns a protobuf representation of the system registers in State.
 func (s State) Proto() *rpb.Registers {
 	regs := &rpb.AMD64Registers{
diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go
new file mode 100644
index 000000000..04ac283c6
--- /dev/null
+++ b/pkg/sentry/arch/arch_x86_impl.go
@@ -0,0 +1,43 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 i386
+
+package arch
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/cpuid"
+)
+
+// State contains the common architecture bits for X86 (the build tag of this
+// file ensures it's only built on x86).
+//
+// +stateify savable
+type State struct {
+	// The system registers.
+	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+
+	// Our floating point state.
+	x86FPState `state:"wait"`
+
+	// FeatureSet is a pointer to the currently active feature set.
+	FeatureSet *cpuid.FeatureSet
+}
+
+// afterLoad is invoked by stateify.
+func (s *State) afterLoad() {
+	s.afterLoadFPState()
+}
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
index a6db44e47..558fb53ae 100644
--- a/tools/build/tags.bzl
+++ b/tools/build/tags.bzl
@@ -3,22 +3,28 @@
 go_suffixes = [
     "_386",
     "_386_unsafe",
-    "_amd64",
-    "_amd64_unsafe",
     "_aarch64",
     "_aarch64_unsafe",
+    "_amd64",
+    "_amd64_unsafe",
     "_arm",
-    "_arm_unsafe",
     "_arm64",
     "_arm64_unsafe",
+    "_arm_unsafe",
+    "_impl",
+    "_impl_unsafe",
+    "_linux",
+    "_linux_unsafe",
     "_mips",
-    "_mips_unsafe",
-    "_mipsle",
-    "_mipsle_unsafe",
     "_mips64",
     "_mips64_unsafe",
     "_mips64le",
     "_mips64le_unsafe",
+    "_mips_unsafe",
+    "_mipsle",
+    "_mipsle_unsafe",
+    "_opts",
+    "_opts_unsafe",
     "_ppc64",
     "_ppc64_unsafe",
     "_ppc64le",
@@ -31,10 +37,4 @@ go_suffixes = [
     "_sparc64_unsafe",
     "_wasm",
     "_wasm_unsafe",
-    "_linux",
-    "_linux_unsafe",
-    "_opts",
-    "_opts_unsafe",
-    "_impl",
-    "_impl_unsafe",
 ]
-- 
cgit v1.2.3


From bfa0bba72abb69cbc7f4da27d3b4b116c3784495 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 10 Feb 2020 12:02:47 -0800
Subject: Redirect FIXME to gvisor.dev

PiperOrigin-RevId: 294272755
---
 pkg/sentry/fsimpl/kernfs/filesystem.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 9d65d0179..e49303c26 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -111,10 +111,10 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		// Dentry isn't cached; it either doesn't exist or failed
 		// revalidation. Attempt to resolve it via Lookup.
 		//
-		// FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry,
-		// not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries
-		// in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl
-		// casts accordingly.
+		// FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
+		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
+		// that all dentries in the filesystem are (kernfs.)Dentry and performs
+		// vfs.DentryImpl casts accordingly.
 		var err error
 		childVFSD, err = parent.inode.Lookup(ctx, name)
 		if err != nil {
-- 
cgit v1.2.3


From c9a18b16ade6ec0bc90fc75d0a4ab0621f9d01d6 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 12:06:06 -0800
Subject: Document MinimumTotalMemoryBytes.

PiperOrigin-RevId: 294273559
---
 pkg/sentry/usage/memory.go | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index 538c645eb..4320ad17f 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -253,6 +253,10 @@ func (m *MemoryLocked) Copy() (MemoryStats, uint64) {
 }
 
 // MinimumTotalMemoryBytes is the minimum reported total system memory.
+//
+// This can be configured through options provided to the Sentry at start.
+// This number is purely synthetic. This is only set before the application
+// starts executing, and must not be modified.
 var MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB
 
 // TotalMemory returns the "total usable memory" available.
-- 
cgit v1.2.3


From 4d4d47f0c0a21d3404d2edae527b187d62daa3c8 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 13:04:29 -0800
Subject: Add contextual note.

PiperOrigin-RevId: 294285723
---
 pkg/sentry/fsimpl/gofer/directory.go | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index baa2cdd8e..6d4ebc2bf 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -87,6 +87,10 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	// to assume that directory fids have the correct semantics, and translates
 	// struct file_operations::readdir calls directly to readdir RPCs), but is
 	// consistent with VFS1.
+	//
+	// NOTE(b/135560623): In particular, some gofer implementations may not
+	// retain state between calls to Readdir, so may not provide a coherent
+	// directory stream across in the presence of mutation.
 
 	d.fs.renameMu.RLock()
 	defer d.fs.renameMu.RUnlock()
-- 
cgit v1.2.3


From bb22ebd7fbfc66556b38df669be5c6372daba018 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 13:20:44 -0800
Subject: Add contextual comment.

PiperOrigin-RevId: 294289066
---
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index d1436b943..2015a8871 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -15,6 +15,9 @@
 // These benchmarks emulate memfs benchmarks. Ext4 images must be created
 // before this benchmark is run using the `make_deep_ext4.sh` script at
 // /tmp/image-{depth}.ext4 for all the depths tested below.
+//
+// The benchmark itself cannot run the script because the script requires
+// sudo privileges to create the file system images.
 package benchmark_test
 
 import (
-- 
cgit v1.2.3


From a6f9361c2f7c5b46a200de1dc891a0ce059ad90e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 13:51:26 -0800
Subject: Add context to comments.

PiperOrigin-RevId: 294295852
---
 pkg/sentry/fs/gofer/attr.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go
index 71cccdc34..6db4b762d 100644
--- a/pkg/sentry/fs/gofer/attr.go
+++ b/pkg/sentry/fs/gofer/attr.go
@@ -88,8 +88,9 @@ func bsize(pattr p9.Attr) int64 {
 	if pattr.BlockSize > 0 {
 		return int64(pattr.BlockSize)
 	}
-	// Some files may have no clue of their block size. Better not to report
-	// something misleading or buggy and have a safe default.
+	// Some files, particularly those that are not on a local file system,
+	// may have no clue of their block size. Better not to report something
+	// misleading or buggy and have a safe default.
 	return usermem.PageSize
 }
 
@@ -149,6 +150,7 @@ func links(valid p9.AttrMask, pattr p9.Attr) uint64 {
 	}
 
 	// This node is likely backed by a file system that doesn't support links.
+	//
 	// We could readdir() and count children directories to provide an accurate
 	// link count. However this may be expensive since the gofer may be backed by remote
 	// storage. Instead, simply return 2 links for directories and 1 for everything else
-- 
cgit v1.2.3


From 2889ffa84ec4737b651b14c3ce019c5005d0dd9c Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 14:09:14 -0800
Subject: Add context to note.

PiperOrigin-RevId: 294300040
---
 pkg/sentry/arch/arch_amd64.s | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s
index bd61402cf..6c10336e7 100644
--- a/pkg/sentry/arch/arch_amd64.s
+++ b/pkg/sentry/arch/arch_amd64.s
@@ -26,10 +26,11 @@
 //
 // func initX86FPState(data *FloatingPointData, useXsave bool)
 //
-// We need to clear out and initialize an empty fp state area since the sentry
-// may have left sensitive information in the floating point registers.
+// We need to clear out and initialize an empty fp state area since the sentry,
+// or any previous loader, may have left sensitive information in the floating
+// point registers.
 //
-// Preconditions: data is zeroed
+// Preconditions: data is zeroed.
 TEXT ·initX86FPState(SB), $24-16
 	// Save MXCSR (callee-save)
 	STMXCSR	mxcsr-8(SP)
-- 
cgit v1.2.3


From 475316e87dac806d69bcb06ea4065f3c138bb47e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 10 Feb 2020 14:47:00 -0800
Subject: Refactor getxattr.

Put most of the logic for getxattr in one place for clarity. This simplifies
FGetXattr and getXattrFromPath, which are just wrappers for getXattr.

PiperOrigin-RevId: 294308332
---
 pkg/sentry/syscalls/linux/sys_xattr.go | 44 ++++++++++++++++------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 342337726..9d8140b8a 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -49,14 +49,11 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 	defer f.DecRef()
 
-	n, value, err := getXattr(t, f.Dirent, nameAddr, size)
+	n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size)
 	if err != nil {
 		return 0, nil, err
 	}
 
-	if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
-		return 0, nil, err
-	}
 	return uintptr(n), nil, nil
 }
 
@@ -71,41 +68,36 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 		return 0, nil, err
 	}
 
-	valueLen := 0
+	n := 0
 	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
 
-		n, value, err := getXattr(t, d, nameAddr, size)
-		valueLen = n
-		if err != nil {
-			return err
-		}
-
-		_, err = t.CopyOutBytes(valueAddr, []byte(value))
+		n, err = getXattr(t, d, nameAddr, valueAddr, size)
 		return err
 	})
 	if err != nil {
 		return 0, nil, err
 	}
-	return uintptr(valueLen), nil, nil
+
+	return uintptr(n), nil, nil
 }
 
 // getXattr implements getxattr(2) from the given *fs.Dirent.
-func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) {
-	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
-		return 0, "", err
-	}
-
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64) (int, error) {
 	name, err := copyInXattrName(t, nameAddr)
 	if err != nil {
-		return 0, "", err
+		return 0, err
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+		return 0, err
 	}
 
 	// TODO(b/148380782): Support xattrs in namespaces other than "user".
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return 0, "", syserror.EOPNOTSUPP
+		return 0, syserror.EOPNOTSUPP
 	}
 
 	// If getxattr(2) is called with size 0, the size of the value will be
@@ -118,18 +110,22 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64)
 
 	value, err := d.Inode.GetXattr(t, name, requestedSize)
 	if err != nil {
-		return 0, "", err
+		return 0, err
 	}
 	n := len(value)
 	if uint64(n) > requestedSize {
-		return 0, "", syserror.ERANGE
+		return 0, syserror.ERANGE
 	}
 
 	// Don't copy out the attribute value if size is 0.
 	if size == 0 {
-		return n, "", nil
+		return n, nil
+	}
+
+	if _, err = t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+		return 0, err
 	}
-	return n, value, nil
+	return n, nil
 }
 
 // SetXattr implements linux syscall setxattr(2).
-- 
cgit v1.2.3


From 71af006b6fe4504fccb86f0222a8a1864d33fb7d Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 10 Feb 2020 17:12:03 -0800
Subject: Cleanup internal package group.

PiperOrigin-RevId: 294339229
---
 pkg/sentry/BUILD | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index e8b794179..e759dc36f 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -1,13 +1,11 @@
-# This BUILD file defines a package_group that allows for interdependencies for
-# sentry-internal packages.
-
 package(licenses = ["notice"])
 
+# The "internal" package_group should be used as much as possible by packages
+# that should remain Sentry-internal (i.e. not be exposed directly to command
+# line tooling or APIs).
 package_group(
     name = "internal",
     packages = [
-        "//cloud/gvisor/gopkg/sentry/...",
-        "//cloud/gvisor/sentry/...",
         "//pkg/sentry/...",
         "//runsc/...",
         # Code generated by go_marshal relies on go_marshal libraries.
-- 
cgit v1.2.3


From 762e4761cc4edd92108f6836ad1933c7158b8be8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 11:08:28 -0800
Subject: Move Align{Up,Down} into binary package.

PiperOrigin-RevId: 294477647
---
 pkg/binary/binary.go                      | 10 ++++++++++
 pkg/sentry/socket/control/control.go      | 26 ++++++++------------------
 pkg/sentry/socket/netfilter/extensions.go |  7 +------
 pkg/sentry/socket/netlink/message.go      | 15 ++++-----------
 pkg/sentry/strace/BUILD                   |  1 -
 pkg/sentry/strace/socket.go               |  7 +++----
 6 files changed, 26 insertions(+), 40 deletions(-)

(limited to 'pkg')

diff --git a/pkg/binary/binary.go b/pkg/binary/binary.go
index 631785f7b..25065aef9 100644
--- a/pkg/binary/binary.go
+++ b/pkg/binary/binary.go
@@ -254,3 +254,13 @@ func WriteUint64(w io.Writer, order binary.ByteOrder, num uint64) error {
 	_, err := w.Write(buf)
 	return err
 }
+
+// AlignUp rounds a length up to an alignment. align must be a power of 2.
+func AlignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
+
+// AlignDown rounds a length down to an alignment. align must be a power of 2.
+func AlignDown(length int, align uint) int {
+	return length & ^(int(align) - 1)
+}
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 00265f15b..6145a7fc3 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -189,7 +189,7 @@ func putUint32(buf []byte, n uint32) []byte {
 // putCmsg writes a control message header and as much data as will fit into
 // the unused capacity of a buffer.
 func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) {
-	space := AlignDown(cap(buf)-len(buf), 4)
+	space := binary.AlignDown(cap(buf)-len(buf), 4)
 
 	// We can't write to space that doesn't exist, so if we are going to align
 	// the available space, we must align down.
@@ -282,19 +282,9 @@ func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int
 	return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c)
 }
 
-// AlignUp rounds a length up to an alignment. align must be a power of 2.
-func AlignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
-
-// AlignDown rounds a down to an alignment. align must be a power of 2.
-func AlignDown(length int, align uint) int {
-	return length & ^(int(align) - 1)
-}
-
 // alignSlice extends a slice's length (up to the capacity) to align it.
 func alignSlice(buf []byte, align uint) []byte {
-	aligned := AlignUp(len(buf), align)
+	aligned := binary.AlignUp(len(buf), align)
 	if aligned > cap(buf) {
 		// Linux allows unaligned data if there isn't room for alignment.
 		// Since there isn't room for alignment, there isn't room for any
@@ -377,7 +367,7 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 
 // cmsgSpace is equivalent to CMSG_SPACE in Linux.
 func cmsgSpace(t *kernel.Task, dataLen int) int {
-	return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width())
+	return linux.SizeOfControlMessageHeader + binary.AlignUp(dataLen, t.Arch().Width())
 }
 
 // CmsgsSpace returns the number of bytes needed to fit the control messages
@@ -437,7 +427,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_SOCKET:
 			switch h.Type {
 			case linux.SCM_RIGHTS:
-				rightsSize := AlignDown(length, linux.SizeOfControlMessageRight)
+				rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
 				numRights := rightsSize / linux.SizeOfControlMessageRight
 
 				if len(fds)+numRights > linux.SCM_MAX_FD {
@@ -448,7 +438,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 					fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
 				}
 
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			case linux.SCM_CREDENTIALS:
 				if length < linux.SizeOfControlMessageCredentials {
@@ -462,7 +452,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 					return socket.ControlMessages{}, err
 				}
 				cmsgs.Unix.Credentials = scmCreds
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				// Unknown message type.
@@ -476,7 +466,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				}
 				cmsgs.IP.HasTOS = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
@@ -489,7 +479,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				}
 				cmsgs.IP.HasTClass = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 22fd0ebe7..b4b244abf 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -72,7 +72,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 	nflog("marshaling matcher %q", name)
 
 	// We have to pad this struct size to a multiple of 8 bytes.
-	size := alignUp(linux.SizeOfXTEntryMatch+len(data), 8)
+	size := binary.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8)
 	matcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
 			MatchSize: uint16(size),
@@ -93,8 +93,3 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter,
 	}
 	return matchMaker.unmarshal(buf, filter)
 }
-
-// alignUp rounds a length up to an alignment. align must be a power of 2.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index 4ea252ccb..0899c61d1 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -23,18 +23,11 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// alignUp rounds a length up to an alignment.
-//
-// Preconditions: align is a power of two.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) &^ (int(align) - 1)
-}
-
 // alignPad returns the length of padding required for alignment.
 //
 // Preconditions: align is a power of two.
 func alignPad(length int, align uint) int {
-	return alignUp(length, align) - length
+	return binary.AlignUp(length, align) - length
 }
 
 // Message contains a complete serialized netlink message.
@@ -138,7 +131,7 @@ func (m *Message) Finalize() []byte {
 	// Align the message. Note that the message length in the header (set
 	// above) is the useful length of the message, not the total aligned
 	// length. See net/netlink/af_netlink.c:__nlmsg_put.
-	aligned := alignUp(len(m.buf), linux.NLMSG_ALIGNTO)
+	aligned := binary.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO)
 	m.putZeros(aligned - len(m.buf))
 	return m.buf
 }
@@ -173,7 +166,7 @@ func (m *Message) PutAttr(atype uint16, v interface{}) {
 	m.Put(v)
 
 	// Align the attribute.
-	aligned := alignUp(l, linux.NLA_ALIGNTO)
+	aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
 	m.putZeros(aligned - l)
 }
 
@@ -190,7 +183,7 @@ func (m *Message) PutAttrString(atype uint16, s string) {
 	m.putZeros(1)
 
 	// Align the attribute.
-	aligned := alignUp(l, linux.NLA_ALIGNTO)
+	aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
 	m.putZeros(aligned - l)
 }
 
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 762a946fe..2f39a6f2b 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -30,7 +30,6 @@ go_library(
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/kernel",
-        "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/syscalls/linux",
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index f7ff4573e..51e6d81b2 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
@@ -220,13 +219,13 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 
 		if skipData {
 			strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length))
-			i += control.AlignUp(length, width)
+			i += binary.AlignUp(length, width)
 			continue
 		}
 
 		switch h.Type {
 		case linux.SCM_RIGHTS:
-			rightsSize := control.AlignDown(length, linux.SizeOfControlMessageRight)
+			rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
 
 			numRights := rightsSize / linux.SizeOfControlMessageRight
 			fds := make(linux.ControlMessageRights, numRights)
@@ -295,7 +294,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 		default:
 			panic("unreachable")
 		}
-		i += control.AlignUp(length, width)
+		i += binary.AlignUp(length, width)
 	}
 
 	return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", "))
-- 
cgit v1.2.3


From 115898e368e4afe5418a7290d9545fafc7f6f25e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 11 Feb 2020 11:37:12 -0800
Subject: Prevent DATA RACE in UnstableAttr.

The slaveInodeOperations is currently copying the object when
truncate is called (which is a no-op). This may result in a
(unconsequential) data race when being modified concurrently.

PiperOrigin-RevId: 294484276
---
 pkg/sentry/fs/tty/slave.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index db55cdc48..6a2dbc576 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -73,7 +73,7 @@ func (si *slaveInodeOperations) Release(ctx context.Context) {
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
-- 
cgit v1.2.3


From b8e22e241cab625d2809034c74d0ff808b948b4c Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 12:58:13 -0800
Subject: Disallow duplicate NIC names.

PiperOrigin-RevId: 294500858
---
 pkg/tcpip/stack/nic.go        |  5 +++
 pkg/tcpip/stack/stack.go      |  9 +++++
 pkg/tcpip/stack/stack_test.go | 85 +++++++++++++++++++++++++++++++++++++++++++
 runsc/sandbox/network.go      | 30 +++++++--------
 4 files changed, 114 insertions(+), 15 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 78d451cca..ca3a7a07e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1215,6 +1215,11 @@ func (n *NIC) ID() tcpip.NICID {
 	return n.id
 }
 
+// Name returns the name of n.
+func (n *NIC) Name() string {
+	return n.name
+}
+
 // Stack returns the instance of the Stack that owns this NIC.
 func (n *NIC) Stack() *Stack {
 	return n.stack
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index b793f1d74..6eac16e16 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -890,6 +890,15 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 		return tcpip.ErrDuplicateNICID
 	}
 
+	// Make sure name is unique, unless unnamed.
+	if opts.Name != "" {
+		for _, n := range s.nics {
+			if n.Name() == opts.Name {
+				return tcpip.ErrDuplicateNICID
+			}
+		}
+	}
+
 	n := newNIC(s, id, opts.Name, ep, opts.Context)
 
 	s.nics[id] = n
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 24133e6f2..7ba604442 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -1792,6 +1792,91 @@ func TestAddProtocolAddressWithOptions(t *testing.T) {
 	verifyAddresses(t, expectedAddresses, gotAddresses)
 }
 
+func TestCreateNICWithOptions(t *testing.T) {
+	type callArgsAndExpect struct {
+		nicID tcpip.NICID
+		opts  stack.NICOptions
+		err   *tcpip.Error
+	}
+
+	tests := []struct {
+		desc  string
+		calls []callArgsAndExpect
+	}{
+		{
+			desc: "DuplicateNICID",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{Name: "eth1"},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{Name: "eth2"},
+					err:   tcpip.ErrDuplicateNICID,
+				},
+			},
+		},
+		{
+			desc: "DuplicateName",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{Name: "lo"},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(2),
+					opts:  stack.NICOptions{Name: "lo"},
+					err:   tcpip.ErrDuplicateNICID,
+				},
+			},
+		},
+		{
+			desc: "Unnamed",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(2),
+					opts:  stack.NICOptions{},
+					err:   nil,
+				},
+			},
+		},
+		{
+			desc: "UnnamedDuplicateNICID",
+			calls: []callArgsAndExpect{
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{},
+					err:   nil,
+				},
+				{
+					nicID: tcpip.NICID(1),
+					opts:  stack.NICOptions{},
+					err:   tcpip.ErrDuplicateNICID,
+				},
+			},
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.desc, func(t *testing.T) {
+			s := stack.New(stack.Options{})
+			ep := channel.New(0, 0, tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"))
+			for _, call := range test.calls {
+				if got, want := s.CreateNICWithOptions(call.nicID, ep, call.opts), call.err; got != want {
+					t.Fatalf("CreateNICWithOptions(%v, _, %+v) = %v, want %v", call.nicID, call.opts, got, want)
+				}
+			}
+		})
+	}
+}
+
 func TestNICStats(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index ff48f5646..99e143696 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -174,13 +174,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
 		}
 
-		// We build our own loopback devices.
+		// We build our own loopback device.
 		if iface.Flags&net.FlagLoopback != 0 {
-			links, err := loopbackLinks(iface, allAddrs)
+			link, err := loopbackLink(iface, allAddrs)
 			if err != nil {
-				return fmt.Errorf("getting loopback routes and links for iface %q: %v", iface.Name, err)
+				return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err)
 			}
-			args.LoopbackLinks = append(args.LoopbackLinks, links...)
+			args.LoopbackLinks = append(args.LoopbackLinks, link)
 			continue
 		}
 
@@ -339,25 +339,25 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 	return &socketEntry{deviceFile, gsoMaxSize}, nil
 }
 
-// loopbackLinks collects the links for a loopback interface.
-func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
-	var links []boot.LoopbackLink
+// loopbackLink returns the link with addresses and routes for a loopback
+// interface.
+func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
+	link := boot.LoopbackLink{
+		Name: iface.Name,
+	}
 	for _, addr := range addrs {
 		ipNet, ok := addr.(*net.IPNet)
 		if !ok {
-			return nil, fmt.Errorf("address is not IPNet: %+v", addr)
+			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
 		}
 		dst := *ipNet
 		dst.IP = dst.IP.Mask(dst.Mask)
-		links = append(links, boot.LoopbackLink{
-			Name:      iface.Name,
-			Addresses: []net.IP{ipNet.IP},
-			Routes: []boot.Route{{
-				Destination: dst,
-			}},
+		link.Addresses = append(link.Addresses, ipNet.IP)
+		link.Routes = append(link.Routes, boot.Route{
+			Destination: dst,
 		})
 	}
-	return links, nil
+	return link, nil
 }
 
 // routesForIface iterates over all routes for the given interface and converts
-- 
cgit v1.2.3


From 6dced977eab69401a114257e386addb9cb03a39d Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 17:38:04 -0800
Subject: Ensure fsimpl/gofer.dentryPlatformFile.hostFileMapper is initialized.

Fixes #1812. (The more direct cause of the deadlock is panic unsafety because
the historically high cost of defer means that we avoid it in hot paths,
including much of MM; defer is much cheaper as of Go 1.14, but still a
measurable overhead.)

PiperOrigin-RevId: 294560316
---
 pkg/sentry/fs/fsutil/host_file_mapper.go | 17 +++++++++++------
 pkg/sentry/fsimpl/gofer/regular_file.go  |  5 +++++
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index 67278aa86..e82afd112 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -65,13 +65,18 @@ type mapping struct {
 	writable bool
 }
 
-// NewHostFileMapper returns a HostFileMapper with no references or cached
-// mappings.
+// Init must be called on zero-value HostFileMappers before first use.
+func (f *HostFileMapper) Init() {
+	f.refs = make(map[uint64]int32)
+	f.mappings = make(map[uint64]mapping)
+}
+
+// NewHostFileMapper returns an initialized HostFileMapper allocated on the
+// heap with no references or cached mappings.
 func NewHostFileMapper() *HostFileMapper {
-	return &HostFileMapper{
-		refs:     make(map[uint64]int32),
-		mappings: make(map[uint64]mapping),
-	}
+	f := &HostFileMapper{}
+	f.Init()
+	return f
 }
 
 // IncRefOn increments the reference count on all offsets in mr.
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 8e11e06b3..54c1031a7 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -571,6 +571,8 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
 	default:
 		panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
 	}
+	// After this point, d may be used as a memmap.Mappable.
+	d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
 	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
 }
 
@@ -799,6 +801,9 @@ type dentryPlatformFile struct {
 	// If this dentry represents a regular file, and handle.fd >= 0,
 	// hostFileMapper caches mappings of handle.fd.
 	hostFileMapper fsutil.HostFileMapper
+
+	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
+	hostFileMapperInitOnce sync.Once
 }
 
 // IncRef implements platform.File.IncRef.
-- 
cgit v1.2.3


From 5205bc7e583f90e5a5855a69de26a0baa888cbdf Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 20:35:55 -0800
Subject: Simplify atomic operations

PiperOrigin-RevId: 294582802
---
 pkg/sleep/commit_noasm.go | 13 ++-----------
 pkg/sleep/sleep_unsafe.go | 23 ++++++++++-------------
 2 files changed, 12 insertions(+), 24 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go
index 3af447fb9..f59061f37 100644
--- a/pkg/sleep/commit_noasm.go
+++ b/pkg/sleep/commit_noasm.go
@@ -28,15 +28,6 @@ import "sync/atomic"
 // It is written in assembly because it is called from g0, so it doesn't have
 // a race context.
 func commitSleep(g uintptr, waitingG *uintptr) bool {
-	for {
-		// Check if the wait was aborted.
-		if atomic.LoadUintptr(waitingG) == 0 {
-			return false
-		}
-
-		// Try to store the G so that wakers know who to wake.
-		if atomic.CompareAndSwapUintptr(waitingG, preparingG, g) {
-			return true
-		}
-	}
+	// Try to store the G so that wakers know who to wake.
+	return atomic.CompareAndSwapUintptr(waitingG, preparingG, g)
 }
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index acbf0229b..65bfcf778 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -299,20 +299,17 @@ func (s *Sleeper) enqueueAssertedWaker(w *Waker) {
 		}
 	}
 
-	for {
-		// Nothing to do if there isn't a G waiting.
-		g := atomic.LoadUintptr(&s.waitingG)
-		if g == 0 {
-			return
-		}
+	// Nothing to do if there isn't a G waiting.
+	if atomic.LoadUintptr(&s.waitingG) == 0 {
+		return
+	}
 
-		// Signal to the sleeper that a waker has been asserted.
-		if atomic.CompareAndSwapUintptr(&s.waitingG, g, 0) {
-			if g != preparingG {
-				// We managed to get a G. Wake it up.
-				goready(g, 0)
-			}
-		}
+	// Signal to the sleeper that a waker has been asserted.
+	switch g := atomic.SwapUintptr(&s.waitingG, 0); g {
+	case 0, preparingG:
+	default:
+		// We managed to get a G. Wake it up.
+		goready(g, 0)
 	}
 }
 
-- 
cgit v1.2.3


From 6fdf2c53a1d084b70602170b660242036fd8fe4f Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 7 Feb 2020 11:21:07 -0800
Subject: iptables: User chains

- Adds creation of user chains via `-N <chainname>`
- Adds `-j RETURN` support for built-in chains, which triggers the
  chain's underflow rule (usually the default policy).
- Adds tests for chain creation, default policies, and `-j RETURN' from
  built-in chains.
---
 pkg/sentry/socket/netfilter/netfilter.go | 115 +++++++++++++++++++-----------
 pkg/tcpip/iptables/iptables.go           |  74 ++++++++++++-------
 pkg/tcpip/iptables/targets.go            |  41 ++++++++---
 pkg/tcpip/iptables/types.go              |  50 +++++--------
 test/iptables/filter_input.go            | 117 ++++++++++++++++++++++++++++++-
 test/iptables/iptables_test.go           |  24 +++++++
 6 files changed, 310 insertions(+), 111 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index ea02627de..3fc80e0de 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -50,7 +50,9 @@ type metadata struct {
 
 // nflog logs messages related to the writing and reading of iptables.
 func nflog(format string, args ...interface{}) {
-	log.Infof("netfilter: "+format, args...)
+	if log.IsLogging(log.Debug) {
+		log.Debugf("netfilter: "+format, args...)
+	}
 }
 
 // GetInfo returns information about iptables.
@@ -227,19 +229,23 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 }
 
 func marshalTarget(target iptables.Target) []byte {
-	switch target.(type) {
-	case iptables.UnconditionalAcceptTarget:
-		return marshalStandardTarget(iptables.Accept)
-	case iptables.UnconditionalDropTarget:
-		return marshalStandardTarget(iptables.Drop)
+	switch tg := target.(type) {
+	case iptables.AcceptTarget:
+		return marshalStandardTarget(iptables.RuleAccept)
+	case iptables.DropTarget:
+		return marshalStandardTarget(iptables.RuleDrop)
 	case iptables.ErrorTarget:
-		return marshalErrorTarget()
+		return marshalErrorTarget(errorTargetName)
+	case iptables.UserChainTarget:
+		return marshalErrorTarget(tg.Name)
+	case iptables.ReturnTarget:
+		return marshalStandardTarget(iptables.RuleReturn)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
-func marshalStandardTarget(verdict iptables.Verdict) []byte {
+func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
 	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
 
 	// The target's name will be the empty string.
@@ -254,14 +260,14 @@ func marshalStandardTarget(verdict iptables.Verdict) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
-func marshalErrorTarget() []byte {
+func marshalErrorTarget(errorName string) []byte {
 	// This is an error target named error
 	target := linux.XTErrorTarget{
 		Target: linux.XTEntryTarget{
 			TargetSize: linux.SizeOfXTErrorTarget,
 		},
 	}
-	copy(target.Name[:], errorTargetName)
+	copy(target.Name[:], errorName)
 	copy(target.Target.Name[:], errorTargetName)
 
 	ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
@@ -270,38 +276,35 @@ func marshalErrorTarget() []byte {
 
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
-func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
+func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
 	switch verdict {
-	case iptables.Accept:
+	case iptables.RuleAccept:
 		return -linux.NF_ACCEPT - 1
-	case iptables.Drop:
+	case iptables.RuleDrop:
 		return -linux.NF_DROP - 1
-	case iptables.Queue:
-		return -linux.NF_QUEUE - 1
-	case iptables.Return:
+	case iptables.RuleReturn:
 		return linux.NF_RETURN
-	case iptables.Jump:
+	default:
 		// TODO(gvisor.dev/issue/170): Support Jump.
-		panic("Jump isn't supported yet")
+		panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
 	}
-	panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
 }
 
-// translateToStandardVerdict translates from the value in a
+// translateToStandardTarget translates from the value in a
 // linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardVerdict(val int32) (iptables.Verdict, error) {
+func translateToStandardTarget(val int32) (iptables.Target, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
-		return iptables.Accept, nil
+		return iptables.AcceptTarget{}, nil
 	case -linux.NF_DROP - 1:
-		return iptables.Drop, nil
+		return iptables.DropTarget{}, nil
 	case -linux.NF_QUEUE - 1:
-		return iptables.Invalid, errors.New("unsupported iptables verdict QUEUE")
+		return nil, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		return iptables.Invalid, errors.New("unsupported iptables verdict RETURN")
+		return iptables.ReturnTarget{}, nil
 	default:
-		return iptables.Invalid, fmt.Errorf("unknown iptables verdict %d", val)
+		return nil, fmt.Errorf("unknown iptables verdict %d", val)
 	}
 }
 
@@ -411,6 +414,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
+					if !validUnderflow(table.Rules[ruleIdx]) {
+						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.")
+						return syserr.ErrInvalidArgument
+					}
 					table.Underflows[hk] = ruleIdx
 				}
 			}
@@ -425,12 +432,34 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 	}
 
+	// Add the user chains.
+	for ruleIdx, rule := range table.Rules {
+		target, ok := rule.Target.(iptables.UserChainTarget)
+		if !ok {
+			continue
+		}
+
+		// We found a user chain. Before inserting it into the table,
+		// check that:
+		// - There's some other rule after it.
+		// - There are no matchers.
+		if ruleIdx == len(table.Rules)-1 {
+			nflog("user chain must have a rule or default policy.")
+			return syserr.ErrInvalidArgument
+		}
+		if len(table.Rules[ruleIdx].Matchers) != 0 {
+			nflog("user chain's first node must have no matcheres.")
+			return syserr.ErrInvalidArgument
+		}
+		table.UserChains[target.Name] = ruleIdx + 1
+	}
+
 	// TODO(gvisor.dev/issue/170): Support other chains.
 	// Since we only support modifying the INPUT chain right now, make sure
 	// all other chains point to ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
 		if hook != iptables.Input {
-			if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok {
+			if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -519,18 +548,7 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		buf = optVal[:linux.SizeOfXTStandardTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		verdict, err := translateToStandardVerdict(standardTarget.Verdict)
-		if err != nil {
-			return nil, err
-		}
-		switch verdict {
-		case iptables.Accept:
-			return iptables.UnconditionalAcceptTarget{}, nil
-		case iptables.Drop:
-			return iptables.UnconditionalDropTarget{}, nil
-		default:
-			return nil, fmt.Errorf("Unknown verdict: %v", verdict)
-		}
+		return translateToStandardTarget(standardTarget.Verdict)
 
 	case errorTargetName:
 		// Error target.
@@ -548,11 +566,14 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		//   somehow fall through every rule.
 		// * To mark the start of a user defined chain. These
 		//   rules have an error with the name of the chain.
-		switch errorTarget.Name.String() {
+		switch name := errorTarget.Name.String(); name {
 		case errorTargetName:
+			nflog("set entries: error target")
 			return iptables.ErrorTarget{}, nil
 		default:
-			return nil, fmt.Errorf("unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
+			// User defined chain.
+			nflog("set entries: user-defined target %q", name)
+			return iptables.UserChainTarget{Name: name}, nil
 		}
 	}
 
@@ -585,6 +606,18 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 		iptip.InverseFlags != 0
 }
 
+func validUnderflow(rule iptables.Rule) bool {
+	if len(rule.Matchers) != 0 {
+		return false
+	}
+	switch rule.Target.(type) {
+	case iptables.AcceptTarget, iptables.DropTarget:
+		return true
+	default:
+		return false
+	}
+}
+
 func hookFromLinux(hook int) iptables.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 1b9485bbd..75a433a3b 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -52,10 +52,10 @@ func DefaultTables() IPTables {
 		Tables: map[string]Table{
 			TablenameNat: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -74,8 +74,8 @@ func DefaultTables() IPTables {
 			},
 			TablenameMangle: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -90,9 +90,9 @@ func DefaultTables() IPTables {
 			},
 			TablenameFilter: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -149,13 +149,11 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	for _, tablename := range it.Priorities[hook] {
 		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
 		// If the table returns Accept, move on to the next table.
-		case Accept:
+		case TableAccept:
 			continue
 		// The Drop verdict is final.
-		case Drop:
+		case TableDrop:
 			return false
-		case Stolen, Queue, Repeat, None, Jump, Return, Continue:
-			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
 		default:
 			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
@@ -166,36 +164,58 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 }
 
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict {
+func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) TableVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
 	table := it.Tables[tablename]
 	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
 		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
-		// In either of these cases, this table is done with the packet.
-		case Accept, Drop:
-			return verdict
-		// Continue traversing the rules of the table.
-		case Continue:
+		case RuleAccept:
+			return TableAccept
+
+		case RuleDrop:
+			return TableDrop
+
+		case RuleContinue:
 			continue
-		case Stolen, Queue, Repeat, None, Jump, Return:
-			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
+
+		case RuleReturn:
+			// TODO(gvisor.dev/issue/170): We don't implement jump
+			// yet, so any Return is from a built-in chain. That
+			// means we have to to call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			// Underflow is guaranteed to be an unconditional
+			// ACCEPT or DROP.
+			switch v, _ := underflow.Target.Action(pkt); v {
+			case RuleAccept:
+				return TableAccept
+			case RuleDrop:
+				return TableDrop
+			case RuleContinue, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
 		default:
-			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
+			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
 		}
+
 	}
 
-	panic(fmt.Sprintf("Traversed past the entire list of iptables rules in table %q.", tablename))
+	// We got through the entire table without a decision. Default to DROP
+	// for safety.
+	return TableDrop
 }
 
 // Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) Verdict {
+func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
 	rule := table.Rules[ruleIdx]
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
 	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
-		return Continue
+		return RuleContinue
 	}
 
 	// Go through each rule matcher. If they all match, run
@@ -203,10 +223,10 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	for _, matcher := range rule.Matchers {
 		matches, hotdrop := matcher.Match(hook, pkt, "")
 		if hotdrop {
-			return Drop
+			return RuleDrop
 		}
 		if !matches {
-			return Continue
+			return RuleContinue
 		}
 	}
 
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 4dd281371..9fc60cfad 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -21,20 +21,20 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
-// UnconditionalAcceptTarget accepts all packets.
-type UnconditionalAcceptTarget struct{}
+// AcceptTarget accepts packets.
+type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalAcceptTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	return Accept, ""
+func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleAccept, ""
 }
 
-// UnconditionalDropTarget denies all packets.
-type UnconditionalDropTarget struct{}
+// DropTarget drops packets.
+type DropTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalDropTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	return Drop, ""
+func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleDrop, ""
 }
 
 // ErrorTarget logs an error and drops the packet. It represents a target that
@@ -42,7 +42,26 @@ func (UnconditionalDropTarget) Action(packet tcpip.PacketBuffer) (Verdict, strin
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	log.Warningf("ErrorTarget triggered.")
-	return Drop, ""
+func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	log.Debugf("ErrorTarget triggered.")
+	return RuleDrop, ""
+}
+
+// UserChainTarget marks a rule as the beginning of a user chain.
+type UserChainTarget struct {
+	Name string
+}
+
+// Action implements Target.Action.
+func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+	panic("UserChainTarget should never be called.")
+}
+
+// ReturnTarget returns from the current chain. If the chain is a built-in, the
+// hook's underflow should be called.
+type ReturnTarget struct{}
+
+// Action implements Target.Action.
+func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleReturn, ""
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 7d593c35c..5735d001b 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -56,44 +56,32 @@ const (
 	NumHooks
 )
 
-// A Verdict is returned by a rule's target to indicate how traversal of rules
-// should (or should not) continue.
-type Verdict int
+// A TableVerdict is what a table decides should be done with a packet.
+type TableVerdict int
 
 const (
-	// Invalid indicates an unkonwn or erroneous verdict.
-	Invalid Verdict = iota
+	// TableAccept indicates the packet should continue through netstack.
+	TableAccept TableVerdict = iota
 
-	// Accept indicates the packet should continue traversing netstack as
-	// normal.
-	Accept
-
-	// Drop inicates the packet should be dropped, stopping traversing
-	// netstack.
-	Drop
-
-	// Stolen indicates the packet was co-opted by the target and should
-	// stop traversing netstack.
-	Stolen
-
-	// Queue indicates the packet should be queued for userspace processing.
-	Queue
+	// TableAccept indicates the packet should be dropped.
+	TableDrop
+)
 
-	// Repeat indicates the packet should re-traverse the chains for the
-	// current hook.
-	Repeat
+// A RuleVerdict is what a rule decides should be done with a packet.
+type RuleVerdict int
 
-	// None indicates no verdict was reached.
-	None
+const (
+	// RuleAccept indicates the packet should continue through netstack.
+	RuleAccept RuleVerdict = iota
 
-	// Jump indicates a jump to another chain.
-	Jump
+	// RuleContinue indicates the packet should continue to the next rule.
+	RuleContinue
 
-	// Continue indicates that traversal should continue at the next rule.
-	Continue
+	// RuleDrop indicates the packet should be dropped.
+	RuleDrop
 
-	// Return indicates that traversal should return to the calling chain.
-	Return
+	// RuleReturn indicates the packet should return to the previous chain.
+	RuleReturn
 )
 
 // IPTables holds all the tables for a netstack.
@@ -187,5 +175,5 @@ type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (Verdict, string)
+	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
 }
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index bd6059921..e26d6a7d2 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -36,6 +36,10 @@ func init() {
 	RegisterTestCase(FilterInputDropTCPSrcPort{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropUDP{})
+	RegisterTestCase(FilterInputCreateUserChain{})
+	RegisterTestCase(FilterInputDefaultPolicyAccept{})
+	RegisterTestCase(FilterInputDefaultPolicyDrop{})
+	RegisterTestCase(FilterInputReturnUnderflow{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -295,8 +299,119 @@ func (FilterInputRequireProtocolUDP) ContainerAction(ip net.IP) error {
 	return nil
 }
 
-// LocalAction implements TestCase.LocalAction.
 func (FilterInputRequireProtocolUDP) LocalAction(ip net.IP) error {
 	// No-op.
 	return nil
 }
+
+// FilterInputCreateUserChain tests chain creation.
+type FilterInputCreateUserChain struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputCreateUserChain) Name() string {
+	return "FilterInputCreateUserChain"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputCreateUserChain) ContainerAction(ip net.IP) error {
+	// Create a chain.
+	const chainName = "foochain"
+	if err := filterTable("-N", chainName); err != nil {
+		return err
+	}
+
+	// Add a simple rule to the chain.
+	return filterTable("-A", chainName, "-j", "DROP")
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputCreateUserChain) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// FilterInputDefaultPolicyAccept tests the default ACCEPT policy.
+type FilterInputDefaultPolicyAccept struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDefaultPolicyAccept) Name() string {
+	return "FilterInputDefaultPolicyAccept"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDefaultPolicyAccept) ContainerAction(ip net.IP) error {
+	// Set the default policy to accept, then receive a packet.
+	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDefaultPolicyAccept) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputDefaultPolicyDrop tests the default DROP policy.
+type FilterInputDefaultPolicyDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDefaultPolicyDrop) Name() string {
+	return "FilterInputDefaultPolicyDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDefaultPolicyDrop) ContainerAction(ip net.IP) error {
+	if err := filterTable("-P", "INPUT", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDefaultPolicyDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputReturnUnderflow tests that -j RETURN in a built-in chain causes
+// the underflow rule (i.e. default policy) to be executed.
+type FilterInputReturnUnderflow struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputReturnUnderflow) Name() string {
+	return "FilterInputReturnUnderflow"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
+	// Add a RETURN rule followed by an unconditional accept, and set the
+	// default policy to DROP.
+	if err := filterTable("-A", "INPUT", "-j", "RETURN"); err != nil {
+		return err
+	}
+	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
+		return err
+	}
+	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// We should receive packets, as the RETURN rule will trigger the default
+	// ACCEPT policy.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 41909582a..46a7c99b0 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -214,6 +214,30 @@ func TestFilterInputDropTCPSrcPort(t *testing.T) {
 	}
 }
 
+func TestFilterInputCreateUserChain(t *testing.T) {
+	if err := singleTest(FilterInputCreateUserChain{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDefaultPolicyAccept(t *testing.T) {
+	if err := singleTest(FilterInputDefaultPolicyAccept{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDefaultPolicyDrop(t *testing.T) {
+	if err := singleTest(FilterInputDefaultPolicyDrop{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputReturnUnderflow(t *testing.T) {
+	if err := singleTest(FilterInputReturnUnderflow{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
 	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
-- 
cgit v1.2.3


From d30a884775556474f1a893fd30460a6edf0a3039 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 11 Feb 2020 00:52:23 +0000
Subject: Add definition of arch.ARMTrapFlag.

Fixes #1708

Signed-off-by: Haibo Xu haibo.xu@arm.com
Change-Id: Ib15768692ead17c81c06f7666ca3f0a14064c3a0
---
 pkg/sentry/arch/arch_aarch64.go | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 3b6987665..d7794db63 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -34,6 +34,9 @@ const (
 	SyscallWidth = 4
 )
 
+// ARMTrapFlag is the mask for the trap flag.
+const ARMTrapFlag = uint64(1) << 21
+
 // aarch64FPState is aarch64 floating point state.
 type aarch64FPState []byte
 
-- 
cgit v1.2.3


From 69bf39e8a47d3b4dcbbd04d2e8df476cdfab5e74 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 10:58:47 -0800
Subject: Internal change.

PiperOrigin-RevId: 294952610
---
 pkg/abi/linux/socket.go                        | 13 ++++
 pkg/sentry/socket/control/BUILD                |  1 +
 pkg/sentry/socket/control/control.go           | 43 +++++++++++++
 pkg/sentry/socket/hostinet/socket.go           | 11 +++-
 pkg/sentry/socket/netstack/netstack.go         | 37 ++++++++++--
 pkg/tcpip/tcpip.go                             | 25 ++++++++
 pkg/tcpip/transport/udp/endpoint.go            | 26 ++++++++
 test/syscalls/linux/socket_ip_udp_generic.cc   | 44 ++++++++++++++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 84 ++++++++++++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc   |  1 -
 10 files changed, 278 insertions(+), 7 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 766ee4014..4a14ef691 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -411,6 +411,15 @@ type ControlMessageCredentials struct {
 	GID uint32
 }
 
+// A ControlMessageIPPacketInfo is IP_PKTINFO socket control message.
+//
+// ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h.
+type ControlMessageIPPacketInfo struct {
+	NIC             int32
+	LocalAddr       InetAddr
+	DestinationAddr InetAddr
+}
+
 // SizeOfControlMessageCredentials is the binary size of a
 // ControlMessageCredentials struct.
 var SizeOfControlMessageCredentials = int(binary.Size(ControlMessageCredentials{}))
@@ -431,6 +440,10 @@ const SizeOfControlMessageTOS = 1
 // SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message.
 const SizeOfControlMessageTClass = 4
 
+// SizeOfControlMessageIPPacketInfo is the size of an IP_PKTINFO
+// control message.
+const SizeOfControlMessageIPPacketInfo = 12
+
 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call.
 // From net/scm.h.
 const SCM_MAX_FD = 253
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 79e16d6e8..4d42d29cb 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/syserror",
+        "//pkg/tcpip",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 6145a7fc3..4667373d2 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -338,6 +339,22 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
+// PackIPPacketInfo packs an IP_PKTINFO socket control message.
+func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte {
+	var p linux.ControlMessageIPPacketInfo
+	p.NIC = int32(packetInfo.NIC)
+	copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr))
+	copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr))
+
+	return putCmsgStruct(
+		buf,
+		linux.SOL_IP,
+		linux.IP_PKTINFO,
+		t.Arch().Width(),
+		p,
+	)
+}
+
 // PackControlMessages packs control messages into the given buffer.
 //
 // We skip control messages specific to Unix domain sockets.
@@ -362,6 +379,10 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 		buf = PackTClass(t, cmsgs.IP.TClass, buf)
 	}
 
+	if cmsgs.IP.HasIPPacketInfo {
+		buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf)
+	}
+
 	return buf
 }
 
@@ -394,6 +415,16 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
 	return space
 }
 
+// NewIPPacketInfo returns the IPPacketInfo struct.
+func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo {
+	var p tcpip.IPPacketInfo
+	p.NIC = tcpip.NICID(packetInfo.NIC)
+	copy([]byte(p.LocalAddr), packetInfo.LocalAddr[:])
+	copy([]byte(p.DestinationAddr), packetInfo.DestinationAddr[:])
+
+	return p
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
@@ -468,6 +499,18 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += binary.AlignUp(length, width)
 
+			case linux.IP_PKTINFO:
+				if length < linux.SizeOfControlMessageIPPacketInfo {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
+
+				cmsgs.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+
+				cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo)
+				i += binary.AlignUp(length, width)
+
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
 			}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index de76388ac..22f78d2e2 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -289,7 +289,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	switch level {
 	case linux.SOL_IP:
 		switch name {
-		case linux.IP_TOS, linux.IP_RECVTOS:
+		case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_IPV6:
@@ -336,6 +336,8 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 		switch name {
 		case linux.IP_TOS, linux.IP_RECVTOS:
 			optlen = sizeofInt32
+		case linux.IP_PKTINFO:
+			optlen = linux.SizeOfControlMessageIPPacketInfo
 		}
 	case linux.SOL_IPV6:
 		switch name {
@@ -473,7 +475,14 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			case syscall.IP_TOS:
 				controlMessages.IP.HasTOS = true
 				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS)
+
+			case syscall.IP_PKTINFO:
+				controlMessages.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+				controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo)
 			}
+
 		case syscall.SOL_IPV6:
 			switch unixCmsg.Header.Type {
 			case syscall.IPV6_TCLASS:
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index ed2fbcceb..9757fbfba 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1414,6 +1414,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return o, nil
 
+	case linux.IP_PKTINFO:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1762,6 +1777,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		linux.IPV6_IPSEC_POLICY,
 		linux.IPV6_JOIN_ANYCAST,
 		linux.IPV6_LEAVE_ANYCAST,
+		// TODO(b/148887420): Add support for IPV6_PKTINFO.
 		linux.IPV6_PKTINFO,
 		linux.IPV6_ROUTER_ALERT,
 		linux.IPV6_XFRM_POLICY,
@@ -1949,6 +1965,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
 
+	case linux.IP_PKTINFO:
+		if len(optVal) == 0 {
+			return nil
+		}
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1964,7 +1990,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_NODEFRAG,
 		linux.IP_OPTIONS,
 		linux.IP_PASSSEC,
-		linux.IP_PKTINFO,
 		linux.IP_RECVERR,
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
@@ -2395,10 +2420,12 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
 	return socket.ControlMessages{
 		IP: tcpip.ControlMessages{
-			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
-			Timestamp:    s.readCM.Timestamp,
-			HasTOS:       s.readCM.HasTOS,
-			TOS:          s.readCM.TOS,
+			HasTimestamp:    s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:       s.readCM.Timestamp,
+			HasTOS:          s.readCM.HasTOS,
+			TOS:             s.readCM.TOS,
+			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
+			PacketInfo:      s.readCM.PacketInfo,
 		},
 	}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0e944712f..9ca39ce40 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -328,6 +328,12 @@ type ControlMessages struct {
 
 	// Tclass is the IPv6 traffic class of the associated packet.
 	TClass int32
+
+	// HasIPPacketInfo indicates whether PacketInfo is set.
+	HasIPPacketInfo bool
+
+	// PacketInfo holds interface and address data on an incoming packet.
+	PacketInfo IPPacketInfo
 }
 
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
@@ -503,6 +509,11 @@ const (
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
 	V6OnlyOption
+
+	// ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify
+	// if more inforamtion is provided with incoming packets such
+	// as interface index and address.
+	ReceiveIPPacketInfoOption
 )
 
 // SockOptInt represents socket options which values have the int type.
@@ -685,6 +696,20 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
+// IPPacketInfo is the message struture for IP_PKTINFO.
+//
+// +stateify savable
+type IPPacketInfo struct {
+	// NIC is the ID of the NIC to be used.
+	NIC NICID
+
+	// LocalAddr is the local address.
+	LocalAddr Address
+
+	// DestinationAddr is the destination address.
+	DestinationAddr Address
+}
+
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index c9cbed8f4..3fe91cac2 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -29,6 +29,7 @@ import (
 type udpPacket struct {
 	udpPacketEntry
 	senderAddress tcpip.FullAddress
+	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
 	tos           uint8
@@ -118,6 +119,9 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveIPPacketInfo determines if the packet info is returned by Read.
+	receiveIPPacketInfo bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -254,11 +258,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
+
+	if receiveIPPacketInfo {
+		cm.HasIPPacketInfo = true
+		cm.PacketInfo = p.packetInfo
+	}
 	return p.data.ToView(), cm, nil
 }
 
@@ -495,6 +505,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
+		return nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+		return nil
 	}
 
 	return nil
@@ -703,6 +720,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 
 		return v, nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.RLock()
+		v := e.receiveIPPacketInfo
+		e.mu.RUnlock()
+		return v, nil
 	}
 
 	return false, tcpip.ErrUnknownProtocolOption
@@ -1247,6 +1270,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	switch r.NetProto {
 	case header.IPv4ProtocolNumber:
 		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+		packet.packetInfo.LocalAddr = r.LocalAddress
+		packet.packetInfo.DestinationAddr = r.RemoteAddress
+		packet.packetInfo.NIC = r.NICID()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 53290bed7..db5663ecd 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -357,5 +357,49 @@ TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Test getsockopt for a socket which is not set with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, IPPKTINFODefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_IP, IP_PKTINFO, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test setsockopt and getsockopt for a socket with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  // Check getsockopt before IP_PKTINFO is set.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOn);
+  EXPECT_EQ(get_len, sizeof(get));
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOff);
+  EXPECT_EQ(get_len, sizeof(get));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 990ccf23c..bc4b07a62 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -15,6 +15,7 @@
 #include "test/syscalls/linux/socket_ipv4_udp_unbound.h"
 
 #include <arpa/inet.h>
+#include <net/if.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -2128,5 +2129,88 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
+// Test that socket will receive packet info control message.
+TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF((IsRunningWithHostinet()));
+
+  auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto sender_addr = V4Loopback();
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  ASSERT_THAT(
+      bind(receiver->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t sender_addr_len = sender_addr.addr_len;
+  ASSERT_THAT(getsockname(receiver->get(),
+                          reinterpret_cast<sockaddr*>(&sender_addr.addr),
+                          &sender_addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+  auto receiver_addr = V4Loopback();
+  reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port =
+      reinterpret_cast<sockaddr_in*>(&sender_addr.addr)->sin_port;
+  ASSERT_THAT(
+      connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+              receiver_addr.addr_len),
+      SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  ASSERT_THAT(
+      setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  msghdr sent_msg = {};
+  iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = sent_data;
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  sent_msg.msg_flags = 0;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  msghdr received_msg = {};
+  iovec received_iov = {};
+  char received_data[kDataLength];
+  char received_cmsg_buf[CMSG_SPACE(sizeof(in_pktinfo))] = {};
+  size_t cmsg_data_len = sizeof(in_pktinfo);
+  received_iov.iov_base = received_data;
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  received_msg.msg_control = received_cmsg_buf;
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(receiver->get(), &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, level);
+  EXPECT_EQ(cmsg->cmsg_type, type);
+
+  // Get loopback index.
+  ifreq ifr = {};
+  absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo");
+  ASSERT_THAT(ioctl(sender->get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  ASSERT_NE(ifr.ifr_ifindex, 0);
+
+  // Check the data
+  in_pktinfo received_pktinfo = {};
+  memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo));
+  EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex);
+  EXPECT_EQ(received_pktinfo.ipi_spec_dst.s_addr, htonl(INADDR_LOOPBACK));
+  EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index a2f6ef8cc..9f8de6b48 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1495,6 +1495,5 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 6ef63cd7da107d487fda7c48af50fa9802913cd9 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 12 Feb 2020 16:19:06 -0800
Subject: We can now create and jump in iptables. For example:

$ iptables -N foochain
$ iptables -A INPUT -j foochain
---
 pkg/abi/linux/netfilter.go               |   9 +-
 pkg/sentry/socket/netfilter/BUILD        |   1 +
 pkg/sentry/socket/netfilter/netfilter.go |  62 +++++++--
 pkg/sentry/socket/netfilter/targets.go   |  35 +++++
 pkg/tcpip/iptables/iptables.go           | 103 +++++++++------
 pkg/tcpip/iptables/targets.go            |  20 ++-
 pkg/tcpip/iptables/types.go              |  21 +--
 test/iptables/filter_input.go            | 217 ++++++++++++++++++++++++++++---
 test/iptables/iptables_test.go           |  36 +++++
 test/iptables/iptables_util.go           |  10 ++
 10 files changed, 420 insertions(+), 94 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/targets.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index bbc4df74c..bd2e13ba1 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -225,11 +225,14 @@ type XTEntryTarget struct {
 // SizeOfXTEntryTarget is the size of an XTEntryTarget.
 const SizeOfXTEntryTarget = 32
 
-// XTStandardTarget is a builtin target, one of ACCEPT, DROP, JUMP, QUEUE, or
-// RETURN. It corresponds to struct xt_standard_target in
+// XTStandardTarget is a built-in target, one of ACCEPT, DROP, JUMP, QUEUE,
+// RETURN, or jump. It corresponds to struct xt_standard_target in
 // include/uapi/linux/netfilter/x_tables.h.
 type XTStandardTarget struct {
-	Target  XTEntryTarget
+	Target XTEntryTarget
+	// A positive verdict indicates a jump, and is the offset from the
+	// start of the table to jump to. A negative value means one of the
+	// other built-in targets.
 	Verdict int32
 	_       [4]byte
 }
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index c91ec7494..7cd2ce55b 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "extensions.go",
         "netfilter.go",
+        "targets.go",
         "tcp_matcher.go",
         "udp_matcher.go",
     ],
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3fc80e0de..d322e4144 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -240,13 +240,15 @@ func marshalTarget(target iptables.Target) []byte {
 		return marshalErrorTarget(tg.Name)
 	case iptables.ReturnTarget:
 		return marshalStandardTarget(iptables.RuleReturn)
+	case JumpTarget:
+		return marshalJumpTarget(tg)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
 func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
-	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+	nflog("convert to binary: marshalling standard target")
 
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
@@ -274,6 +276,23 @@ func marshalErrorTarget(errorName string) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
+func marshalJumpTarget(jt JumpTarget) []byte {
+	nflog("convert to binary: marshalling jump target")
+
+	// The target's name will be the empty string.
+	target := linux.XTStandardTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTStandardTarget,
+		},
+		// Verdict is overloaded by the ABI. When positive, it holds
+		// the jump offset from the start of the table.
+		Verdict: int32(jt.Offset),
+	}
+
+	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
 func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
@@ -335,7 +354,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
-	var offsets []uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
 		nflog("set entries: processing entry at offset %d", offset)
 
@@ -396,11 +416,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			Target:   target,
 			Matchers: matchers,
 		})
-		offsets = append(offsets, offset)
+		offsets[offset] = int(entryIdx)
 		offset += uint32(entry.NextOffset)
 
 		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
 			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 	}
 
@@ -409,13 +430,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, _ := range replace.HookEntry {
 		if table.ValidHooks()&(1<<hook) != 0 {
 			hk := hookFromLinux(hook)
-			for ruleIdx, offset := range offsets {
+			for offset, ruleIdx := range offsets {
 				if offset == replace.HookEntry[hook] {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
 					if !validUnderflow(table.Rules[ruleIdx]) {
-						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.")
+						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP")
 						return syserr.ErrInvalidArgument
 					}
 					table.Underflows[hk] = ruleIdx
@@ -444,16 +465,35 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// - There's some other rule after it.
 		// - There are no matchers.
 		if ruleIdx == len(table.Rules)-1 {
-			nflog("user chain must have a rule or default policy.")
+			nflog("user chain must have a rule or default policy")
 			return syserr.ErrInvalidArgument
 		}
 		if len(table.Rules[ruleIdx].Matchers) != 0 {
-			nflog("user chain's first node must have no matcheres.")
+			nflog("user chain's first node must have no matchers")
 			return syserr.ErrInvalidArgument
 		}
 		table.UserChains[target.Name] = ruleIdx + 1
 	}
 
+	// Set each jump to point to the appropriate rule. Right now they hold byte
+	// offsets.
+	for ruleIdx, rule := range table.Rules {
+		jump, ok := rule.Target.(JumpTarget)
+		if !ok {
+			continue
+		}
+
+		// Find the rule corresponding to the jump rule offset.
+		jumpTo, ok := offsets[jump.Offset]
+		if !ok {
+			nflog("failed to find a rule to jump to")
+			return syserr.ErrInvalidArgument
+		}
+		jump.RuleNum = jumpTo
+		rule.Target = jump
+		table.Rules[ruleIdx] = rule
+	}
+
 	// TODO(gvisor.dev/issue/170): Support other chains.
 	// Since we only support modifying the INPUT chain right now, make sure
 	// all other chains point to ACCEPT rules.
@@ -548,7 +588,13 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		buf = optVal[:linux.SizeOfXTStandardTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		return translateToStandardTarget(standardTarget.Verdict)
+		if standardTarget.Verdict < 0 {
+			// A Verdict < 0 indicates a non-jump verdict.
+			return translateToStandardTarget(standardTarget.Verdict)
+		} else {
+			// A verdict >= 0 indicates a jump.
+			return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
+		}
 
 	case errorTargetName:
 		// Error target.
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
new file mode 100644
index 000000000..c421b87cf
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+)
+
+// JumpTarget implements iptables.Target.
+type JumpTarget struct {
+	// Offset is the byte offset of the rule to jump to. It is used for
+	// marshaling and unmarshaling.
+	Offset uint32
+
+	// RuleNum is the rule to jump to.
+	RuleNum int
+}
+
+// Action implements iptables.Target.Action.
+func (jt JumpTarget) Action(tcpip.PacketBuffer) (iptables.RuleVerdict, int) {
+	return iptables.RuleJump, jt.RuleNum
+}
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 75a433a3b..dbaccbb36 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -135,25 +135,53 @@ func EmptyFilterTable() Table {
 	}
 }
 
+// A chainVerdict is what a table decides should be done with a packet.
+type chainVerdict int
+
+const (
+	// chainAccept indicates the packet should continue through netstack.
+	chainAccept chainVerdict = iota
+
+	// chainAccept indicates the packet should be dropped.
+	chainDrop
+
+	// chainReturn indicates the packet should return to the calling chain
+	// or the underflow rule of a builtin chain.
+	chainReturn
+)
+
 // Check runs pkt through the rules for hook. It returns true when the packet
 // should continue traversing the network stack and false when it should be
 // dropped.
 //
 // Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
-	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
-	// we're missing features. Jumps, the call stack, etc. aren't checked
-	// for yet because we're yet to support them.
-
 	// Go through each table containing the hook.
 	for _, tablename := range it.Priorities[hook] {
-		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
+		table := it.Tables[tablename]
+		ruleIdx := table.BuiltinChains[hook]
+		switch verdict := it.checkChain(hook, pkt, table, ruleIdx); verdict {
 		// If the table returns Accept, move on to the next table.
-		case TableAccept:
+		case chainAccept:
 			continue
 		// The Drop verdict is final.
-		case TableDrop:
+		case chainDrop:
 			return false
+		case chainReturn:
+			// Any Return from a built-in chain means we have to
+			// call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			switch v, _ := underflow.Target.Action(pkt); v {
+			case RuleAccept:
+				continue
+			case RuleDrop:
+				return false
+			case RuleJump, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
 		default:
 			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
@@ -164,37 +192,37 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 }
 
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) TableVerdict {
+func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
-	table := it.Tables[tablename]
-	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
-		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
+	for ruleIdx < len(table.Rules) {
+		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx); verdict {
 		case RuleAccept:
-			return TableAccept
+			return chainAccept
 
 		case RuleDrop:
-			return TableDrop
-
-		case RuleContinue:
-			continue
+			return chainDrop
 
 		case RuleReturn:
-			// TODO(gvisor.dev/issue/170): We don't implement jump
-			// yet, so any Return is from a built-in chain. That
-			// means we have to to call the underflow.
-			underflow := table.Rules[table.Underflows[hook]]
-			// Underflow is guaranteed to be an unconditional
-			// ACCEPT or DROP.
-			switch v, _ := underflow.Target.Action(pkt); v {
-			case RuleAccept:
-				return TableAccept
-			case RuleDrop:
-				return TableDrop
-			case RuleContinue, RuleReturn:
-				panic("Underflows should only return RuleAccept or RuleDrop.")
+			return chainReturn
+
+		case RuleJump:
+			// "Jumping" to the next rule just means we're
+			// continuing on down the list.
+			if jumpTo == ruleIdx+1 {
+				ruleIdx++
+				continue
+			}
+			switch verdict := it.checkChain(hook, pkt, table, jumpTo); verdict {
+			case chainAccept:
+				return chainAccept
+			case chainDrop:
+				return chainDrop
+			case chainReturn:
+				ruleIdx++
+				continue
 			default:
-				panic(fmt.Sprintf("Unknown verdict: %d", v))
+				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
 			}
 
 		default:
@@ -205,17 +233,18 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 
 	// We got through the entire table without a decision. Default to DROP
 	// for safety.
-	return TableDrop
+	return chainDrop
 }
 
 // Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
+func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
 	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
-		return RuleContinue
+		// Continue on to the next rule.
+		return RuleJump, ruleIdx + 1
 	}
 
 	// Go through each rule matcher. If they all match, run
@@ -223,14 +252,14 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	for _, matcher := range rule.Matchers {
 		matches, hotdrop := matcher.Match(hook, pkt, "")
 		if hotdrop {
-			return RuleDrop
+			return RuleDrop, 0
 		}
 		if !matches {
-			return RuleContinue
+			// Continue on to the next rule.
+			return RuleJump, ruleIdx + 1
 		}
 	}
 
 	// All the matchers matched, so run the target.
-	verdict, _ := rule.Target.Action(pkt)
-	return verdict
+	return rule.Target.Action(pkt)
 }
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 9fc60cfad..81a2e39a2 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// This file contains various Targets.
-
 package iptables
 
 import (
@@ -25,16 +23,16 @@ import (
 type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleAccept, ""
+func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleAccept, 0
 }
 
 // DropTarget drops packets.
 type DropTarget struct{}
 
 // Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleDrop, ""
+func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleDrop, 0
 }
 
 // ErrorTarget logs an error and drops the packet. It represents a target that
@@ -42,9 +40,9 @@ func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 	log.Debugf("ErrorTarget triggered.")
-	return RuleDrop, ""
+	return RuleDrop, 0
 }
 
 // UserChainTarget marks a rule as the beginning of a user chain.
@@ -53,7 +51,7 @@ type UserChainTarget struct {
 }
 
 // Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 	panic("UserChainTarget should never be called.")
 }
 
@@ -62,6 +60,6 @@ func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
 type ReturnTarget struct{}
 
 // Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleReturn, ""
+func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleReturn, 0
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 5735d001b..7d032fd23 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -56,17 +56,6 @@ const (
 	NumHooks
 )
 
-// A TableVerdict is what a table decides should be done with a packet.
-type TableVerdict int
-
-const (
-	// TableAccept indicates the packet should continue through netstack.
-	TableAccept TableVerdict = iota
-
-	// TableAccept indicates the packet should be dropped.
-	TableDrop
-)
-
 // A RuleVerdict is what a rule decides should be done with a packet.
 type RuleVerdict int
 
@@ -74,12 +63,12 @@ const (
 	// RuleAccept indicates the packet should continue through netstack.
 	RuleAccept RuleVerdict = iota
 
-	// RuleContinue indicates the packet should continue to the next rule.
-	RuleContinue
-
 	// RuleDrop indicates the packet should be dropped.
 	RuleDrop
 
+	// RuleJump indicates the packet should jump to another chain.
+	RuleJump
+
 	// RuleReturn indicates the packet should return to the previous chain.
 	RuleReturn
 )
@@ -174,6 +163,6 @@ type Matcher interface {
 type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
-	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
+	// Jump, it also returns the index of the rule to jump to.
+	Action(packet tcpip.PacketBuffer) (RuleVerdict, int)
 }
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index e26d6a7d2..706c09cea 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -26,6 +26,7 @@ const (
 	acceptPort       = 2402
 	sendloopDuration = 2 * time.Second
 	network          = "udp4"
+	chainName        = "foochain"
 )
 
 func init() {
@@ -40,6 +41,12 @@ func init() {
 	RegisterTestCase(FilterInputDefaultPolicyAccept{})
 	RegisterTestCase(FilterInputDefaultPolicyDrop{})
 	RegisterTestCase(FilterInputReturnUnderflow{})
+	RegisterTestCase(FilterInputSerializeJump{})
+	RegisterTestCase(FilterInputJumpBasic{})
+	RegisterTestCase(FilterInputJumpReturn{})
+	RegisterTestCase(FilterInputJumpReturnDrop{})
+	RegisterTestCase(FilterInputJumpBuiltin{})
+	RegisterTestCase(FilterInputJumpTwice{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -267,13 +274,12 @@ func (FilterInputMultiUDPRules) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
-		return err
-	}
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"},
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"},
+		{"-L"},
 	}
-	return filterTable("-L")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -314,14 +320,13 @@ func (FilterInputCreateUserChain) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputCreateUserChain) ContainerAction(ip net.IP) error {
-	// Create a chain.
-	const chainName = "foochain"
-	if err := filterTable("-N", chainName); err != nil {
-		return err
+	rules := [][]string{
+		// Create a chain.
+		{"-N", chainName},
+		// Add a simple rule to the chain.
+		{"-A", chainName, "-j", "DROP"},
 	}
-
-	// Add a simple rule to the chain.
-	return filterTable("-A", chainName, "-j", "DROP")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -396,13 +401,12 @@ func (FilterInputReturnUnderflow) Name() string {
 func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 	// Add a RETURN rule followed by an unconditional accept, and set the
 	// default policy to DROP.
-	if err := filterTable("-A", "INPUT", "-j", "RETURN"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-j", "RETURN"},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-P", "INPUT", "ACCEPT"},
 	}
-	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
-		return err
-	}
-	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+	if err := filterTableRules(rules); err != nil {
 		return err
 	}
 
@@ -415,3 +419,178 @@ func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// Verify that we can serialize jumps.
+type FilterInputSerializeJump struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputSerializeJump) Name() string {
+	return "FilterInputSerializeJump"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputSerializeJump) ContainerAction(ip net.IP) error {
+	// Write a JUMP rule, the serialize it with `-L`.
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-L"},
+	}
+	return filterTableRules(rules)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputSerializeJump) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// Jump to a chain and execute a rule there.
+type FilterInputJumpBasic struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBasic) Name() string {
+	return "FilterInputJumpBasic"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBasic) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBasic) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// Jump, return, and execute a rule.
+type FilterInputJumpReturn struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturn) Name() string {
+	return "FilterInputJumpReturn"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturn) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-P", "INPUT", "ACCEPT"},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "RETURN"},
+		{"-A", chainName, "-j", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturn) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+type FilterInputJumpReturnDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturnDrop) Name() string {
+	return "FilterInputJumpReturnDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturnDrop) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-A", chainName, "-j", "RETURN"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturnDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// Jumping to a top-levl chain is illegal.
+type FilterInputJumpBuiltin struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBuiltin) Name() string {
+	return "FilterInputJumpBuiltin"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBuiltin) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-j", "OUTPUT"); err == nil {
+		return fmt.Errorf("iptables should be unable to jump to a built-in chain")
+	}
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBuiltin) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// Jump twice, then return twice and execute a rule.
+type FilterInputJumpTwice struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpTwice) Name() string {
+	return "FilterInputJumpTwice"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpTwice) ContainerAction(ip net.IP) error {
+	const chainName2 = chainName + "2"
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-N", chainName2},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", chainName2},
+		{"-A", "INPUT", "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// UDP packets should jump and return twice, eventually hitting the
+	// ACCEPT rule.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpTwice) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 46a7c99b0..0621861eb 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -249,3 +249,39 @@ func TestFilterOutputDropTCPSrcPort(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestJumpSerialize(t *testing.T) {
+	if err := singleTest(FilterInputSerializeJump{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpBasic(t *testing.T) {
+	if err := singleTest(FilterInputJumpBasic{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpReturn(t *testing.T) {
+	if err := singleTest(FilterInputJumpReturn{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpReturnDrop(t *testing.T) {
+	if err := singleTest(FilterInputJumpReturnDrop{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpBuiltin(t *testing.T) {
+	if err := singleTest(FilterInputJumpBuiltin{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpTwice(t *testing.T) {
+	if err := singleTest(FilterInputJumpTwice{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 043114c78..293c4e6ed 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -35,6 +35,16 @@ func filterTable(args ...string) error {
 	return nil
 }
 
+// filterTableRules is like filterTable, but runs multiple iptables commands.
+func filterTableRules(argsList [][]string) error {
+	for _, args := range argsList {
+		if err := filterTable(args...); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 // listenUDP listens on a UDP port and returns the value of net.Conn.Read() for
 // the first read on that port.
 func listenUDP(port int, timeout time.Duration) error {
-- 
cgit v1.2.3


From a6024f7f5f6f438c11e30be0f93657b1956fd5ba Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 17:56:34 -0800
Subject: Add FileExec flag to OpenOptions

This allow callers to say whether the file is being
opened to be executed, so that the proper checks can
be done from FilesystemImpl.OpenAt()

Updates #1623

PiperOrigin-RevId: 295042595
---
 pkg/sentry/fsimpl/ext/filesystem.go            |  2 +-
 pkg/sentry/fsimpl/ext/inode.go                 | 12 ++++++------
 pkg/sentry/fsimpl/gofer/filesystem.go          | 24 ++++++++++++------------
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  4 ++--
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |  6 +++---
 pkg/sentry/fsimpl/kernfs/filesystem.go         | 10 +++++-----
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  6 +++---
 pkg/sentry/fsimpl/kernfs/kernfs.go             |  2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        |  8 ++++----
 pkg/sentry/fsimpl/proc/subtasks.go             |  4 ++--
 pkg/sentry/fsimpl/proc/task.go                 |  4 ++--
 pkg/sentry/fsimpl/proc/tasks.go                |  4 ++--
 pkg/sentry/fsimpl/sys/sys.go                   |  4 ++--
 pkg/sentry/fsimpl/tmpfs/filesystem.go          |  2 +-
 pkg/sentry/vfs/options.go                      |  5 +++++
 pkg/sentry/vfs/permissions.go                  | 19 ++++++++++++-------
 pkg/sentry/vfs/vfs.go                          | 19 +++++++++++++++++++
 17 files changed, 82 insertions(+), 53 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 07bf58953..e05429d41 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -296,7 +296,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
 		return nil, syserror.EROFS
 	}
-	return inode.open(rp, vfsd, opts.Flags)
+	return inode.open(rp, vfsd, &opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 191b39970..6962083f5 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -148,8 +148,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 }
 
 // open creates and returns a file description for the dentry passed in.
-func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
 	if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
@@ -157,7 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
@@ -168,17 +168,17 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
 	case *symlink:
-		if flags&linux.O_PATH == 0 {
+		if opts.Flags&linux.O_PATH == 0 {
 			// Can't open symlinks without O_PATH.
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 8eb61debf..138adb9f7 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -593,7 +593,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		}
 	}
 	if rp.Done() {
-		return start.openLocked(ctx, rp, opts.Flags)
+		return start.openLocked(ctx, rp, &opts)
 	}
 
 afterTrailingSymlink:
@@ -633,12 +633,12 @@ afterTrailingSymlink:
 		start = parent
 		goto afterTrailingSymlink
 	}
-	return child.openLocked(ctx, rp, opts.Flags)
+	return child.openLocked(ctx, rp, &opts)
 }
 
 // Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
 	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
 		return nil, err
 	}
@@ -646,11 +646,11 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
 	filetype := d.fileType()
 	switch {
 	case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD:
-		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil {
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
 			return nil, err
 		}
 		fd := &regularFileFD{}
-		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
 			AllowDirectIO: true,
 		}); err != nil {
 			return nil, err
@@ -658,21 +658,21 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
 		return &fd.vfsfd, nil
 	case filetype == linux.S_IFDIR:
 		// Can't open directories with O_CREAT.
-		if flags&linux.O_CREAT != 0 {
+		if opts.Flags&linux.O_CREAT != 0 {
 			return nil, syserror.EISDIR
 		}
 		// Can't open directories writably.
 		if ats&vfs.MayWrite != 0 {
 			return nil, syserror.EISDIR
 		}
-		if flags&linux.O_DIRECT != 0 {
+		if opts.Flags&linux.O_DIRECT != 0 {
 			return nil, syserror.EINVAL
 		}
 		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
 			return nil, err
 		}
 		fd := &directoryFD{}
-		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
@@ -680,17 +680,17 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	default:
-		if flags&linux.O_DIRECT != 0 {
+		if opts.Flags&linux.O_DIRECT != 0 {
 			return nil, syserror.EINVAL
 		}
-		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0)
+		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
 		if err != nil {
 			return nil, err
 		}
 		fd := &specialFileFD{
 			handle: h,
 		}
-		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			h.close(ctx)
 			return nil, err
 		}
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 733792c78..d092ccb2a 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -53,9 +53,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy
 }
 
 // Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil {
+	if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 6104751c8..eda781155 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -43,12 +43,12 @@ type GenericDirectoryFD struct {
 }
 
 // Init initializes a GenericDirectoryFD.
-func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error {
-	if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 {
+func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error {
+	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
-	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
 	fd.children = children
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index e49303c26..ee98eb66a 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -365,7 +365,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// appropriate bits in rp), but are returned by
 	// FileDescriptionImpl.StatusFlags().
 	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
-	ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+	ats := vfs.AccessTypesForOpenFlags(&opts)
 
 	// Do not create new file.
 	if opts.Flags&linux.O_CREAT == 0 {
@@ -379,7 +379,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts.Flags)
+		return inode.Open(rp, vfsd, opts)
 	}
 
 	// May create new file.
@@ -398,7 +398,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts.Flags)
+		return inode.Open(rp, vfsd, opts)
 	}
 afterTrailingSymlink:
 	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
@@ -438,7 +438,7 @@ afterTrailingSymlink:
 			return nil, err
 		}
 		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
-		return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags)
+		return child.Impl().(*Dentry).inode.Open(rp, child, opts)
 	}
 	// Open existing file or follow symlink.
 	if mustCreate {
@@ -463,7 +463,7 @@ afterTrailingSymlink:
 	if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	return childInode.Open(rp, childVFSD, opts.Flags)
+	return childInode.Open(rp, childVFSD, opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index adca2313f..099d70a16 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -507,7 +507,7 @@ type InodeSymlink struct {
 }
 
 // Open implements Inode.Open.
-func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ELOOP
 }
 
@@ -549,8 +549,8 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F
 }
 
 // Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 79ebea8a5..c74fa999b 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -303,7 +303,7 @@ type Inode interface {
 	// inode for its lifetime.
 	//
 	// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
-	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error)
+	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
 }
 
 type inodeRefs interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index ee65cf491..96a16e654 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -113,9 +113,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 	return &dir.dentry
 }
 
-func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil {
+	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
@@ -143,9 +143,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	return &dir.dentry
 }
 
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 353e37195..102af0e93 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -114,9 +114,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 }
 
 // Open implements kernfs.Inode.
-func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index eb5bc62c0..2d814668a 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -98,9 +98,9 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 }
 
 // Open implements kernfs.Inode.
-func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 14bd334e8..ebe21630c 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -205,9 +205,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 }
 
 // Open implements kernfs.Inode.
-func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index e35d52d17..d693fceae 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -97,9 +97,9 @@ func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
 }
 
 // Open implements kernfs.Inode.Open.
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
 	return fd.VFSFileDescription(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 72bc15264..8785452b6 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -334,7 +334,7 @@ afterTrailingSymlink:
 }
 
 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+	ats := vfs.AccessTypesForOpenFlags(opts)
 	if !afterCreate {
 		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
 			return nil, err
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index b7774bf28..fdf8be157 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -72,6 +72,11 @@ type OpenOptions struct {
 	// If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
 	// created file.
 	Mode linux.FileMode
+
+	// FileExec is set when the file is being opened to be executed.
+	// VirtualFilesystem.OpenAt() checks that the caller has execute permissions
+	// on the file, and that the file is a regular file.
+	FileExec bool
 }
 
 // ReadOptions contains options to FileDescription.PRead(),
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f664581f4..8e250998a 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -103,17 +103,22 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 // AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
 //
 // Use May{Read,Write}FileWithOpenFlags() for these checks instead.
-func AccessTypesForOpenFlags(flags uint32) AccessTypes {
-	switch flags & linux.O_ACCMODE {
+func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes {
+	ats := AccessTypes(0)
+	if opts.FileExec {
+		ats |= MayExec
+	}
+
+	switch opts.Flags & linux.O_ACCMODE {
 	case linux.O_RDONLY:
-		if flags&linux.O_TRUNC != 0 {
-			return MayRead | MayWrite
+		if opts.Flags&linux.O_TRUNC != 0 {
+			return ats | MayRead | MayWrite
 		}
-		return MayRead
+		return ats | MayRead
 	case linux.O_WRONLY:
-		return MayWrite
+		return ats | MayWrite
 	default:
-		return MayRead | MayWrite
+		return ats | MayRead | MayWrite
 	}
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 908c69f91..9629afee9 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -379,6 +379,25 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
+
+			// TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
+			// to FileDescription.Stat().
+			if opts.FileExec {
+				// Only a regular file can be executed.
+				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
+				if err != nil {
+					return nil, err
+				}
+				if stat.Mask&linux.STATX_TYPE != 0 {
+					// This shouldn't happen, but if type can't be retrieved, file can't
+					// be executed.
+					return nil, syserror.EACCES
+				}
+				if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular {
+					return nil, syserror.EACCES
+				}
+			}
+
 			return fd, nil
 		}
 		if !rp.handleError(err) {
-- 
cgit v1.2.3


From ebaf29abeb5e6e6cd87f4b0088719abb30e9d4cb Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 14 Feb 2020 02:04:41 -0500
Subject: passed the kvm test case of "TestKernelSyscall" on Arm64

For kvm test case "TestKernelSyscall",
redpill/syscall(-1) in guest kernel level will be trapped in el1_svc.
And in el1_svc, we use mmio_exit to leave the guest.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/ring0/entry_arm64.s   | 73 ++++++++++++++++++++++++++-----
 pkg/sentry/platform/ring0/kernel_arm64.go |  8 ++++
 2 files changed, 70 insertions(+), 11 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index baa6c4910..d42eda37b 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -25,10 +25,14 @@
 // not available for calls.
 //
 
+// ERET returns using the ELR and SPSR for the current exception level.
 #define ERET() \
   WORD $0xd69f03e0
 
+// RSV_REG is a register that holds el1 information temporarily.
 #define RSV_REG 	R18_PLATFORM
+
+// RSV_REG_APP is a register that holds el0 information temporarily.
 #define RSV_REG_APP 	R9
 
 #define FPEN_NOTRAP 	0x3
@@ -36,6 +40,12 @@
 
 #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
 
+// Saves a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not saved: R9, R18.
 #define REGISTERS_SAVE(reg, offset) \
   MOVD R0, offset+PTRACE_R0(reg); \
   MOVD R1, offset+PTRACE_R1(reg); \
@@ -67,6 +77,12 @@
   MOVD R29, offset+PTRACE_R29(reg); \
   MOVD R30, offset+PTRACE_R30(reg);
 
+// Loads a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not loaded: R9, R18.
 #define REGISTERS_LOAD(reg, offset) \
   MOVD offset+PTRACE_R0(reg), R0; \
   MOVD offset+PTRACE_R1(reg), R1; \
@@ -98,7 +114,7 @@
   MOVD offset+PTRACE_R29(reg), R29; \
   MOVD offset+PTRACE_R30(reg), R30;
 
-//NOP
+// NOP-s
 #define nop31Instructions() \
         WORD $0xd503201f; \
         WORD $0xd503201f; \
@@ -254,6 +270,7 @@
 #define ESR_ELx_WFx_ISS_WFE	(UL(1) << 0)
 #define ESR_ELx_xVC_IMM_MASK	((1UL << 16) - 1)
 
+// LOAD_KERNEL_ADDRESS loads a kernel address.
 #define LOAD_KERNEL_ADDRESS(from, to) \
 	MOVD from, to; \
 	ORR $0xffff000000000000, to, to;
@@ -263,15 +280,18 @@
 	LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
 	MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
 	MOVD RSV_REG, RSP; \
+	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
 #define SWITCH_TO_APP_PAGETABLE(from) \
 	MOVD CPU_TTBR0_APP(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable.
 #define SWITCH_TO_KVM_PAGETABLE(from) \
 	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
@@ -294,6 +314,7 @@
 	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
 	ISB $15;
 
+// KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1.
 #define KERNEL_ENTRY_FROM_EL0 \
 	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
@@ -315,19 +336,22 @@
 	WORD $0xd5384103; \      //  MRS SP_EL0, R3
 	MOVD R3, PTRACE_SP(RSV_REG_APP);
 
+// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
 #define KERNEL_ENTRY_FROM_EL1 \
 	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
-	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// save sentry context
+	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// Save sentry context.
 	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
 	WORD $0xd5384004; \    //    MRS SPSR_EL1, R4
 	MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
 	MRS ELR_EL1, R4; \
 	MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \
 	MOVD RSP, R4; \
-	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG);
+	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
+	LOAD_KERNEL_STACK(RSV_REG);  // Load the temporary stack.
 
+// Halt halts execution.
 TEXT ·Halt(SB),NOSPLIT,$0
-	// clear bluepill.
+	// Clear bluepill.
 	WORD $0xd538d092   //MRS   TPIDR_EL1, R18
 	CMP RSV_REG, R9
 	BNE mmio_exit
@@ -341,8 +365,22 @@ mmio_exit:
 	// MMIO_EXIT.
 	MOVD $0, R9
 	MOVD R0, 0xffff000000001000(R9)
-	B ·kernelExitToEl1(SB)
+	RET
+
+// HaltAndResume halts execution and point the pointer to the resume function.
+TEXT ·HaltAndResume(SB),NOSPLIT,$0
+	BL ·Halt(SB)
+	B ·kernelExitToEl1(SB) // Resume.
 
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0
+	WORD $0xd538d092            // MRS TPIDR_EL1, R18
+	MOVD CPU_SELF(RSV_REG), R3  // Load vCPU.
+	MOVD R3, 8(RSP)             // First argument (vCPU).
+	CALL ·kernelSyscall(SB)     // Call the trampoline.
+	B ·kernelExitToEl1(SB)      // Resume.
+
+// Shutdown stops the guest.
 TEXT ·Shutdown(SB),NOSPLIT,$0
 	// PSCI EVENT.
 	MOVD $0x84000009, R0
@@ -429,6 +467,7 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 	ERET()
 
+// Start is the CPU entrypoint.
 TEXT ·Start(SB),NOSPLIT,$0
 	IRQ_DISABLE
 	MOVD R8, RSV_REG
@@ -437,18 +476,23 @@ TEXT ·Start(SB),NOSPLIT,$0
 
 	B ·kernelExitToEl1(SB)
 
+// El1_sync_invalid is the handler for an invalid EL1_sync.
 TEXT ·El1_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_irq_invalid is the handler for an invalid El1_irq.
 TEXT ·El1_irq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq_invalid is the handler for an invalid El1_fiq.
 TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error_invalid is the handler for an invalid El1_error.
 TEXT ·El1_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_sync is the handler for El1_sync.
 TEXT ·El1_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL1
 	WORD $0xd5385219        // MRS ESR_EL1, R25
@@ -484,10 +528,10 @@ el1_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_ia:
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_sp_pc:
 	B ·Shutdown(SB)
@@ -496,7 +540,9 @@ el1_undef:
 	B ·Shutdown(SB)
 
 el1_svc:
-	B ·Halt(SB)
+	MOVD $0, CPU_ERROR_CODE(RSV_REG)
+	MOVD $0, CPU_ERROR_TYPE(RSV_REG)
+	B ·HaltEl1SvcAndResume(SB)
 
 el1_dbg:
 	B ·Shutdown(SB)
@@ -508,15 +554,19 @@ el1_fpsimd_acc:
 el1_invalid:
 	B ·Shutdown(SB)
 
+// El1_irq is the handler for El1_irq.
 TEXT ·El1_irq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq is the handler for El1_fiq.
 TEXT ·El1_fiq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error is the handler for El1_error.
 TEXT ·El1_error(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El0_sync is the handler for El0_sync.
 TEXT ·El0_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL0
 	WORD $0xd5385219	// MRS ESR_EL1, R25
@@ -554,7 +604,7 @@ el0_svc:
 	MOVD $Syscall, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_da:
 	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
@@ -568,7 +618,7 @@ el0_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_ia:
 	B ·Shutdown(SB)
@@ -613,7 +663,7 @@ TEXT ·El0_error(SB),NOSPLIT,$0
 	MOVD $VirtualizationException, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
@@ -627,6 +677,7 @@ TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0
 TEXT ·El0_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// Vectors implements exception vector table.
 TEXT ·Vectors(SB),NOSPLIT,$0
 	B ·El1_sync_invalid(SB)
 	nop31Instructions()
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index c3d341998..ccacaea6b 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -16,6 +16,14 @@
 
 package ring0
 
+// HaltAndResume halts execution and point the pointer to the resume function.
+//go:nosplit
+func HaltAndResume()
+
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+//go:nosplit
+func HaltEl1SvcAndResume()
+
 // init initializes architecture-specific state.
 func (k *Kernel) init(opts KernelOpts) {
 	// Save the root page tables.
-- 
cgit v1.2.3


From b2e86906ea4f7bc43b8d2d3a4735a87eca779b33 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 03:26:42 -0800
Subject: Fix various issues related to enabling go-marshal.

- Add missing build tags to files in the abi package.

- Add the marshal package as a sentry dependency, allowed by deps_test.

- Fix an issue with our top-level go_library BUILD rule, which
  incorrectly shadows the variable containing the input set of source
  files. This caused the expansion for the go_marshal clause to
  silently omit input files.

- Fix formatting when copying build tags to gomarshal-generated files.

- Fix a bug with import statement collision detection in go-marshal.

PiperOrigin-RevId: 295112284
---
 pkg/abi/linux/file_amd64.go             |  2 ++
 pkg/abi/linux/file_arm64.go             |  2 ++
 tools/defs.bzl                          | 12 ++++++------
 tools/go_marshal/gomarshal/generator.go |  2 +-
 tools/go_marshal/gomarshal/util.go      | 25 ++++++++++++++++++-------
 5 files changed, 29 insertions(+), 14 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go
index 9d307e840..8693d49c8 100644
--- a/pkg/abi/linux/file_amd64.go
+++ b/pkg/abi/linux/file_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package linux
 
 // Constants for open(2).
diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go
index 26a54f416..ea3adc5f5 100644
--- a/pkg/abi/linux/file_arm64.go
+++ b/pkg/abi/linux/file_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package linux
 
 // Constants for open(2).
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 46249f9c4..39f035f12 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -117,10 +117,10 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         # First, we need to segregate the input files via the special suffixes,
         # and calculate the final output set.
         state_sets = calculate_sets(srcs)
-        for (suffix, srcs) in state_sets.items():
+        for (suffix, src_subset) in state_sets.items():
             go_stateify(
                 name = name + suffix + "_state_autogen_with_imports",
-                srcs = srcs,
+                srcs = src_subset,
                 imports = imports,
                 package = full_pkg,
                 out = name + suffix + "_state_autogen_with_imports.go",
@@ -140,10 +140,10 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
     if marshal:
         # See above.
         marshal_sets = calculate_sets(srcs)
-        for (suffix, srcs) in marshal_sets.items():
+        for (suffix, src_subset) in marshal_sets.items():
             go_marshal(
                 name = name + suffix + "_abi_autogen",
-                srcs = srcs,
+                srcs = src_subset,
                 debug = False,
                 imports = imports,
                 package = name,
@@ -172,11 +172,11 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
 
         # See above.
         marshal_sets = calculate_sets(srcs)
-        for (suffix, srcs) in marshal_sets.items():
+        for (suffix, _) in marshal_sets.items():
             _go_test(
                 name = name + suffix + "_abi_autogen_test",
                 srcs = [name + suffix + "_abi_autogen_test.go"],
-                library = ":" + name + suffix,
+                library = ":" + name,
                 deps = marshal_test_deps,
                 **kwargs
             )
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 01be7c477..fbec7bb9a 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -123,7 +123,7 @@ func (g *Generator) writeHeader() error {
 	// Emit build tags.
 	if t := tags.Aggregate(g.inputs); len(t) > 0 {
 		b.emit(strings.Join(t.Lines(), "\n"))
-		b.emit("\n")
+		b.emit("\n\n")
 	}
 
 	// Package header.
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index 3d86935b4..e2bca4e7c 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -310,7 +310,7 @@ func (i *importStmt) markUsed() {
 }
 
 func (i *importStmt) equivalent(other *importStmt) bool {
-	return i == other
+	return i.name == other.name && i.path == other.path && i.aliased == other.aliased
 }
 
 // importTable represents a collection of importStmts.
@@ -329,7 +329,7 @@ func newImportTable() *importTable {
 // result in a panic.
 func (i *importTable) merge(other *importTable) {
 	for name, im := range other.is {
-		if dup, ok := i.is[name]; ok && dup.equivalent(im) {
+		if dup, ok := i.is[name]; ok && !dup.equivalent(im) {
 			panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im))
 		}
 
@@ -337,16 +337,27 @@ func (i *importTable) merge(other *importTable) {
 	}
 }
 
+func (i *importTable) addStmt(s *importStmt) *importStmt {
+	if old, ok := i.is[s.name]; ok && !old.equivalent(s) {
+		// A collision should always be between an import inserted by the
+		// go-marshal tool and an import from the original source file (assuming
+		// the original source file was valid). We could theoretically handle
+		// the collision by assigning a local name to our import. However, this
+		// would need to be plumbed throughout the generator. Given that
+		// collisions should be rare, simply panic on collision.
+		panic(fmt.Sprintf("Import collision: old: %s as %v; new: %v as %v", old.path, old.name, s.path, s.name))
+	}
+	i.is[s.name] = s
+	return s
+}
+
 func (i *importTable) add(s string) *importStmt {
 	n := newImport(s)
-	i.is[n.name] = n
-	return n
+	return i.addStmt(n)
 }
 
 func (i *importTable) addFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt {
-	n := newImportFromSpec(spec, f)
-	i.is[n.name] = n
-	return n
+	return i.addStmt(newImportFromSpec(spec, f))
 }
 
 // Marks the import named n as used. If no such import is in the table, returns
-- 
cgit v1.2.3


From 4075de11be44372c454aae7f9650cdc814c52229 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 11:11:55 -0800
Subject: Plumb VFS2 inside the Sentry

- Added fsbridge package with interface that can be used to open
  and read from VFS1 and VFS2 files.
- Converted ELF loader to use fsbridge
- Added VFS2 types to FSContext
- Added vfs.MountNamespace to ThreadGroup

Updates #1623

PiperOrigin-RevId: 295183950
---
 pkg/sentry/control/BUILD                           |   5 +
 pkg/sentry/control/proc.go                         | 127 +++++++++++++--
 pkg/sentry/fs/proc/BUILD                           |   1 +
 pkg/sentry/fs/proc/task.go                         |  17 +-
 pkg/sentry/fsbridge/BUILD                          |  24 +++
 pkg/sentry/fsbridge/bridge.go                      |  54 ++++++
 pkg/sentry/fsbridge/fs.go                          | 181 +++++++++++++++++++++
 pkg/sentry/fsbridge/vfs.go                         | 134 +++++++++++++++
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go             |   4 +
 pkg/sentry/fsimpl/gofer/filesystem.go              |   5 +-
 pkg/sentry/fsimpl/gofer/gofer.go                   |   3 +
 pkg/sentry/fsimpl/kernfs/filesystem.go             |  10 +-
 pkg/sentry/fsimpl/proc/BUILD                       |   1 +
 pkg/sentry/fsimpl/proc/filesystem.go               |  18 +-
 pkg/sentry/fsimpl/proc/tasks_test.go               |  17 +-
 pkg/sentry/fsimpl/sys/BUILD                        |   1 +
 pkg/sentry/fsimpl/sys/sys.go                       |   3 +
 pkg/sentry/fsimpl/sys/sys_test.go                  |   7 +-
 pkg/sentry/fsimpl/testutil/BUILD                   |   2 +-
 pkg/sentry/fsimpl/testutil/kernel.go               |  24 +--
 pkg/sentry/fsimpl/testutil/testutil.go             |  12 +-
 pkg/sentry/fsimpl/tmpfs/filesystem.go              |  12 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go                   |   3 +
 pkg/sentry/kernel/BUILD                            |   2 +
 pkg/sentry/kernel/fs_context.go                    |  98 +++++++++--
 pkg/sentry/kernel/kernel.go                        | 145 +++++++++++++----
 pkg/sentry/kernel/task.go                          |  27 +++
 pkg/sentry/kernel/task_clone.go                    |  11 +-
 pkg/sentry/kernel/task_context.go                  |   2 +-
 pkg/sentry/kernel/task_exit.go                     |   7 +
 pkg/sentry/kernel/task_log.go                      |  15 +-
 pkg/sentry/kernel/task_start.go                    |  49 +++---
 pkg/sentry/kernel/thread_group.go                  |   6 +-
 pkg/sentry/loader/BUILD                            |   2 +
 pkg/sentry/loader/elf.go                           |  28 ++--
 pkg/sentry/loader/interpreter.go                   |   6 +-
 pkg/sentry/loader/loader.go                        | 179 ++++++--------------
 pkg/sentry/loader/vdso.go                          |   7 +-
 pkg/sentry/mm/BUILD                                |   2 +-
 pkg/sentry/mm/metadata.go                          |  10 +-
 pkg/sentry/mm/mm.go                                |   4 +-
 pkg/sentry/strace/strace.go                        |  28 ++++
 pkg/sentry/syscalls/linux/BUILD                    |   1 +
 pkg/sentry/syscalls/linux/sys_prctl.go             |   3 +-
 pkg/sentry/syscalls/linux/sys_thread.go            |  17 +-
 .../syscalls/linux/vfs2/linux64_override_amd64.go  | 106 ++++++++++++
 pkg/sentry/vfs/BUILD                               |   1 +
 pkg/sentry/vfs/context.go                          |   7 +-
 pkg/sentry/vfs/mount.go                            |  10 +-
 pkg/sentry/vfs/options.go                          |   2 +-
 pkg/sentry/vfs/vfs.go                              |   5 +-
 runsc/boot/loader.go                               |  11 +-
 52 files changed, 1134 insertions(+), 322 deletions(-)
 create mode 100644 pkg/sentry/fsbridge/BUILD
 create mode 100644 pkg/sentry/fsbridge/bridge.go
 create mode 100644 pkg/sentry/fsbridge/fs.go
 create mode 100644 pkg/sentry/fsbridge/vfs.go

(limited to 'pkg')

diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index e69496477..d16d78aa5 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,10 +16,13 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fd",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/host",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
@@ -27,8 +30,10 @@ go_library(
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
+        "//pkg/syserror",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
     ],
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index ced51c66c..8973754c8 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -18,19 +18,26 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"path"
 	"sort"
 	"strings"
 	"text/tabwriter"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
@@ -60,6 +67,12 @@ type ExecArgs struct {
 	// process's MountNamespace.
 	MountNamespace *fs.MountNamespace
 
+	// MountNamespaceVFS2 is the mount namespace to execute the new process in.
+	// A reference on MountNamespace must be held for the lifetime of the
+	// ExecArgs. If MountNamespace is nil, it will default to the init
+	// process's MountNamespace.
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// WorkingDirectory defines the working directory for the new process.
 	WorkingDirectory string `json:"wd"`
 
@@ -150,6 +163,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		Envv:                    args.Envv,
 		WorkingDirectory:        args.WorkingDirectory,
 		MountNamespace:          args.MountNamespace,
+		MountNamespaceVFS2:      args.MountNamespaceVFS2,
 		Credentials:             creds,
 		FDTable:                 fdTable,
 		Umask:                   0022,
@@ -166,24 +180,53 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		// be donated to the new process in CreateProcess.
 		initArgs.MountNamespace.IncRef()
 	}
+	if initArgs.MountNamespaceVFS2 != nil {
+		// initArgs must hold a reference on MountNamespaceVFS2, which will
+		// be donated to the new process in CreateProcess.
+		initArgs.MountNamespaceVFS2.IncRef()
+	}
 	ctx := initArgs.NewContext(proc.Kernel)
 
 	if initArgs.Filename == "" {
-		// Get the full path to the filename from the PATH env variable.
-		paths := fs.GetPath(initArgs.Envv)
-		mns := initArgs.MountNamespace
-		if mns == nil {
-			mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
-		}
-		f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
-		if err != nil {
-			return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+		if kernel.VFS2Enabled {
+			// Get the full path to the filename from the PATH env variable.
+			if initArgs.MountNamespaceVFS2 == nil {
+				// Set initArgs so that 'ctx' returns the namespace.
+				//
+				// MountNamespaceVFS2 adds a reference to the namespace, which is
+				// transferred to the new process.
+				initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+			}
+
+			paths := fs.GetPath(initArgs.Envv)
+			vfsObj := proc.Kernel.VFS
+			file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
+			if err != nil {
+				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+			}
+			initArgs.File = fsbridge.NewVFSFile(file)
+		} else {
+			// Get the full path to the filename from the PATH env variable.
+			paths := fs.GetPath(initArgs.Envv)
+			if initArgs.MountNamespace == nil {
+				// Set initArgs so that 'ctx' returns the namespace.
+				initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
+
+				// initArgs must hold a reference on MountNamespace, which will
+				// be donated to the new process in CreateProcess.
+				initArgs.MountNamespaceVFS2.IncRef()
+			}
+			f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
+			if err != nil {
+				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+			}
+			initArgs.Filename = f
 		}
-		initArgs.Filename = f
 	}
 
 	mounter := fs.FileOwnerFromContext(ctx)
 
+	// TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2.
 	var ttyFile *fs.File
 	for appFD, hostFile := range args.FilePayload.Files {
 		var appFile *fs.File
@@ -411,3 +454,67 @@ func ttyName(tty *kernel.TTY) string {
 	}
 	return fmt.Sprintf("pts/%d", tty.Index)
 }
+
+// ResolveExecutablePath resolves the given executable name given a set of
+// paths that might contain it.
+func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) {
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Absolute paths can be used directly.
+	if path.IsAbs(name) {
+		return openExecutable(ctx, vfsObj, creds, root, name)
+	}
+
+	// Paths with '/' in them should be joined to the working directory, or
+	// to the root if working directory is not set.
+	if strings.IndexByte(name, '/') > 0 {
+		if len(wd) == 0 {
+			wd = "/"
+		}
+		if !path.IsAbs(wd) {
+			return nil, fmt.Errorf("working directory %q must be absolute", wd)
+		}
+		return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name))
+	}
+
+	// Otherwise, we must lookup the name in the paths, starting from the
+	// calling context's root directory.
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe, no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		f, err := openExecutable(ctx, vfsObj, creds, root, binPath)
+		if err != nil {
+			return nil, err
+		}
+		if f == nil {
+			continue // Not found/no access.
+		}
+		return f, nil
+	}
+	return nil, syserror.ENOENT
+}
+
+func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) {
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root, // binPath is absolute, Start can be anything.
+		Path:               fspath.Parse(path),
+		FollowFinalSymlink: true,
+	}
+	opts := &vfs.OpenOptions{
+		Flags:    linux.O_RDONLY,
+		FileExec: true,
+	}
+	f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
+	if err == syserror.ENOENT || err == syserror.EACCES {
+		return nil, nil
+	}
+	return f, err
+}
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 280093c5e..77c2c5c0e 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -36,6 +36,7 @@ go_library(
         "//pkg/sentry/fs/proc/device",
         "//pkg/sentry/fs/proc/seqfile",
         "//pkg/sentry/fs/ramfs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index ca020e11e..8ab8d8a02 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -249,7 +250,7 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
 	return newProcInode(t, exeSymlink, msrc, fs.Symlink, t)
 }
 
-func (e *exe) executable() (d *fs.Dirent, err error) {
+func (e *exe) executable() (file fsbridge.File, err error) {
 	e.t.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
@@ -262,8 +263,8 @@ func (e *exe) executable() (d *fs.Dirent, err error) {
 		// The MemoryManager may be destroyed, in which case
 		// MemoryManager.destroy will simply set the executable to nil
 		// (with locks held).
-		d = mm.Executable()
-		if d == nil {
+		file = mm.Executable()
+		if file == nil {
 			err = syserror.ENOENT
 		}
 	})
@@ -283,15 +284,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
 	}
 	defer exec.DecRef()
 
-	root := fs.RootFromContext(ctx)
-	if root == nil {
-		// This doesn't correspond to anything in Linux because the vfs is
-		// global there.
-		return "", syserror.EINVAL
-	}
-	defer root.DecRef()
-	n, _ := exec.FullName(root)
-	return n, nil
+	return exec.PathnameWithDeleted(ctx), nil
 }
 
 // namespaceSymlink represents a symlink in the namespacefs, such as the files
diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD
new file mode 100644
index 000000000..6c798f0bd
--- /dev/null
+++ b/pkg/sentry/fsbridge/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "fsbridge",
+    srcs = [
+        "bridge.go",
+        "fs.go",
+        "vfs.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go
new file mode 100644
index 000000000..8e7590721
--- /dev/null
+++ b/pkg/sentry/fsbridge/bridge.go
@@ -0,0 +1,54 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsbridge provides common interfaces to bridge between VFS1 and VFS2
+// files.
+package fsbridge
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// File provides a common interface to bridge between VFS1 and VFS2 files.
+type File interface {
+	// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+	// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+	// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+	PathnameWithDeleted(ctx context.Context) string
+
+	// ReadFull read all contents from the file.
+	ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+
+	// ConfigureMMap mutates opts to implement mmap(2) for the file.
+	ConfigureMMap(context.Context, *memmap.MMapOpts) error
+
+	// Type returns the file type, e.g. linux.S_IFREG.
+	Type(context.Context) (linux.FileMode, error)
+
+	// IncRef increments reference.
+	IncRef()
+
+	// DecRef decrements reference.
+	DecRef()
+}
+
+// Lookup provides a common interface to open files.
+type Lookup interface {
+	// OpenPath opens a file.
+	OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error)
+}
diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go
new file mode 100644
index 000000000..093ce1fb3
--- /dev/null
+++ b/pkg/sentry/fsbridge/fs.go
@@ -0,0 +1,181 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+	"io"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fsFile implements File interface over fs.File.
+//
+// +stateify savable
+type fsFile struct {
+	file *fs.File
+}
+
+var _ File = (*fsFile)(nil)
+
+// NewFSFile creates a new File over fs.File.
+func NewFSFile(file *fs.File) File {
+	return &fsFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *fsFile) PathnameWithDeleted(ctx context.Context) string {
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// This doesn't correspond to anything in Linux because the vfs is
+		// global there.
+		return ""
+	}
+	defer root.DecRef()
+
+	name, _ := f.file.Dirent.FullName(root)
+	return name
+}
+
+// ReadFull implements File.
+func (f *fsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+	var total int64
+	for dst.NumBytes() > 0 {
+		n, err := f.file.Preadv(ctx, dst, offset+total)
+		total += n
+		if err == io.EOF && total != 0 {
+			return total, io.ErrUnexpectedEOF
+		} else if err != nil {
+			return total, err
+		}
+		dst = dst.DropFirst64(n)
+	}
+	return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *fsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *fsFile) Type(context.Context) (linux.FileMode, error) {
+	return linux.FileMode(f.file.Dirent.Inode.StableAttr.Type.LinuxType()), nil
+}
+
+// IncRef implements File.
+func (f *fsFile) IncRef() {
+	f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *fsFile) DecRef() {
+	f.file.DecRef()
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type fsLookup struct {
+	mntns *fs.MountNamespace
+
+	root       *fs.Dirent
+	workingDir *fs.Dirent
+}
+
+var _ Lookup = (*fsLookup)(nil)
+
+// NewFSLookup creates a new Lookup using VFS1.
+func NewFSLookup(mntns *fs.MountNamespace, root, workingDir *fs.Dirent) Lookup {
+	return &fsLookup{
+		mntns:      mntns,
+		root:       root,
+		workingDir: workingDir,
+	}
+}
+
+// OpenPath implements Lookup.
+func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) {
+	var d *fs.Dirent
+	var err error
+	if resolveFinal {
+		d, err = l.mntns.FindInode(ctx, l.root, l.workingDir, path, remainingTraversals)
+	} else {
+		d, err = l.mntns.FindLink(ctx, l.root, l.workingDir, path, remainingTraversals)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer d.DecRef()
+
+	if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
+		return nil, syserror.ELOOP
+	}
+
+	fsPerm := openOptionsToPermMask(&opts)
+	if err := d.Inode.CheckPermission(ctx, fsPerm); err != nil {
+		return nil, err
+	}
+
+	// If they claim it's a directory, then make sure.
+	if strings.HasSuffix(path, "/") {
+		if d.Inode.StableAttr.Type != fs.Directory {
+			return nil, syserror.ENOTDIR
+		}
+	}
+
+	if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile {
+		ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type)
+		return nil, syserror.EACCES
+	}
+
+	f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags))
+	if err != nil {
+		return nil, err
+	}
+
+	return &fsFile{file: f}, nil
+}
+
+func openOptionsToPermMask(opts *vfs.OpenOptions) fs.PermMask {
+	mode := opts.Flags & linux.O_ACCMODE
+	return fs.PermMask{
+		Read:    mode == linux.O_RDONLY || mode == linux.O_RDWR,
+		Write:   mode == linux.O_WRONLY || mode == linux.O_RDWR,
+		Execute: opts.FileExec,
+	}
+}
+
+func flagsToFileFlags(flags uint32) fs.FileFlags {
+	return fs.FileFlags{
+		Direct:      flags&linux.O_DIRECT != 0,
+		DSync:       flags&(linux.O_DSYNC|linux.O_SYNC) != 0,
+		Sync:        flags&linux.O_SYNC != 0,
+		NonBlocking: flags&linux.O_NONBLOCK != 0,
+		Read:        (flags & linux.O_ACCMODE) != linux.O_WRONLY,
+		Write:       (flags & linux.O_ACCMODE) != linux.O_RDONLY,
+		Append:      flags&linux.O_APPEND != 0,
+		Directory:   flags&linux.O_DIRECTORY != 0,
+		Async:       flags&linux.O_ASYNC != 0,
+		LargeFile:   flags&linux.O_LARGEFILE != 0,
+		Truncate:    flags&linux.O_TRUNC != 0,
+	}
+}
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
new file mode 100644
index 000000000..e657c39bc
--- /dev/null
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -0,0 +1,134 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fsFile implements File interface over vfs.FileDescription.
+//
+// +stateify savable
+type vfsFile struct {
+	file *vfs.FileDescription
+}
+
+var _ File = (*vfsFile)(nil)
+
+// NewVFSFile creates a new File over fs.File.
+func NewVFSFile(file *vfs.FileDescription) File {
+	return &vfsFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string {
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+
+	vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry())
+	return name
+}
+
+// ReadFull implements File.
+func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+	var total int64
+	for dst.NumBytes() > 0 {
+		n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{})
+		total += n
+		if err == io.EOF && total != 0 {
+			return total, io.ErrUnexpectedEOF
+		} else if err != nil {
+			return total, err
+		}
+		dst = dst.DropFirst64(n)
+	}
+	return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) {
+	stat, err := f.file.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		return 0, err
+	}
+	return linux.FileMode(stat.Mode).FileType(), nil
+}
+
+// IncRef implements File.
+func (f *vfsFile) IncRef() {
+	f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *vfsFile) DecRef() {
+	f.file.DecRef()
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type vfsLookup struct {
+	mntns *vfs.MountNamespace
+
+	root       vfs.VirtualDentry
+	workingDir vfs.VirtualDentry
+}
+
+var _ Lookup = (*vfsLookup)(nil)
+
+// NewVFSLookup creates a new Lookup using VFS2.
+func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup {
+	return &vfsLookup{
+		mntns:      mntns,
+		root:       root,
+		workingDir: workingDir,
+	}
+}
+
+// OpenPath implements Lookup.
+//
+// remainingTraversals is not configurable in VFS2, all callers are using the
+// default anyways.
+//
+// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
+func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
+	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	pop := &vfs.PathOperation{
+		Root:               l.root,
+		Start:              l.root,
+		Path:               fspath.Parse(path),
+		FollowFinalSymlink: resolveFinal,
+	}
+	fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return &vfsFile{file: fd}, nil
+}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index e03a0c665..abd4f24e7 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -28,6 +28,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
+// Name is the default filesystem name.
+const Name = "devtmpfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct {
 	initOnce sync.Once
@@ -107,6 +110,7 @@ func (a *Accessor) wrapContext(ctx context.Context) *accessorContext {
 func (ac *accessorContext) Value(key interface{}) interface{} {
 	switch key {
 	case vfs.CtxMountNamespace:
+		ac.a.mntns.IncRef()
 		return ac.a.mntns
 	case vfs.CtxRoot:
 		ac.a.root.IncRef()
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 138adb9f7..5cfb0dc4c 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -400,6 +400,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	}
 	vfsObj := rp.VirtualFilesystem()
 	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
 	childVFSD := parent.vfsd.Child(name)
@@ -934,7 +935,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if oldParent == newParent && oldName == newName {
 		return nil
 	}
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
 		return err
 	}
 	if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index d0552bd99..d00850e25 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -52,6 +52,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Name is the default filesystem name.
+const Name = "9p"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index ee98eb66a..292f58afd 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -544,6 +544,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
 	virtfs := rp.VirtualFilesystem()
 
 	srcDirDentry := srcDirVFSD.Impl().(*Dentry)
@@ -595,7 +596,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	parentDentry := vfsd.Parent().Impl().(*Dentry)
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
-	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
 	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
@@ -697,7 +701,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	parentDentry := vfsd.Parent().Impl().(*Dentry)
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
-	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
 	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 12aac2e6a..a83245866 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -14,6 +14,7 @@ go_library(
         "tasks_net.go",
         "tasks_sys.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 11477b6a9..5c19d5522 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -26,15 +26,18 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
-// procFSType is the factory class for procfs.
+// Name is the default filesystem name.
+const Name = "proc"
+
+// FilesystemType is the factory class for procfs.
 //
 // +stateify savable
-type procFSType struct{}
+type FilesystemType struct{}
 
-var _ vfs.FilesystemType = (*procFSType)(nil)
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
 
 // GetFilesystem implements vfs.FilesystemType.
-func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	k := kernel.KernelFromContext(ctx)
 	if k == nil {
 		return nil, nil, fmt.Errorf("procfs requires a kernel")
@@ -47,12 +50,13 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
 	procfs := &kernfs.Filesystem{}
 	procfs.VFSFilesystem().Init(vfsObj, procfs)
 
-	var data *InternalData
+	var cgroups map[string]string
 	if opts.InternalData != nil {
-		data = opts.InternalData.(*InternalData)
+		data := opts.InternalData.(*InternalData)
+		cgroups = data.Cgroups
 	}
 
-	_, dentry := newTasksInode(procfs, k, pidns, data.Cgroups)
+	_, dentry := newTasksInode(procfs, k, pidns, cgroups)
 	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
 }
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 6fc3524db..96c72cbc9 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -90,8 +90,7 @@ func setup(t *testing.T) *testutil.System {
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 	fsOpts := vfs.GetFilesystemOptions{
@@ -102,11 +101,11 @@ func setup(t *testing.T) *testutil.System {
 			},
 		},
 	}
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts)
+	mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts)
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
-	return testutil.NewSystem(ctx, t, vfsObj, mntns)
+	return testutil.NewSystem(ctx, t, k.VFS, mntns)
 }
 
 func TestTasksEmpty(t *testing.T) {
@@ -131,7 +130,7 @@ func TestTasks(t *testing.T) {
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
@@ -213,7 +212,7 @@ func TestTasksOffset(t *testing.T) {
 	k := kernel.KernelFromContext(s.Ctx)
 	for i := 0; i < 3; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
 	}
@@ -337,7 +336,7 @@ func TestTask(t *testing.T) {
 
 	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	_, err := testutil.CreateTask(s.Ctx, "name", tc)
+	_, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
@@ -352,7 +351,7 @@ func TestProcSelf(t *testing.T) {
 
 	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	task, err := testutil.CreateTask(s.Ctx, "name", tc)
+	task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
@@ -433,7 +432,7 @@ func TestTree(t *testing.T) {
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 66c0d8bc8..a741e2bb6 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "sys.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index d693fceae..c36c4fa11 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -28,6 +28,9 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the default filesystem name.
+const Name = "sysfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 8b1cf0bd0..5d1ba5867 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -34,16 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System {
 	}
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
-	v := vfs.New()
-	v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 
-	mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
-	return testutil.NewSystem(ctx, t, v, mns)
+	return testutil.NewSystem(ctx, t, k.VFS, mns)
 }
 
 func TestReadCPUFile(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index efd5974c4..e4f36f4ae 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -16,7 +16,7 @@ go_library(
         "//pkg/cpuid",
         "//pkg/fspath",
         "//pkg/memutil",
-        "//pkg/sentry/fs",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/sched",
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 89f8c4915..a91b3ec4d 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/memutil"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 
 	// Platforms are plugable.
 	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
@@ -99,26 +100,27 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
-	ctx := k.SupervisorContext()
+	kernel.VFS2Enabled = true
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	k.VFS = vfsObj
 
-	// Create mount namespace without root as it's the minimum required to create
-	// the global thread group.
-	mntns, err := fs.NewMountNamespace(ctx, nil)
-	if err != nil {
-		return nil, err
-	}
 	ls, err := limits.NewLinuxLimitSet()
 	if err != nil {
 		return nil, err
 	}
-	tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+	tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
 	k.TestOnly_SetGlobalInit(tg)
 
 	return k, nil
 }
 
 // CreateTask creates a new bare bones task for tests.
-func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
 	k := kernel.KernelFromContext(ctx)
 	config := &kernel.TaskConfig{
 		Kernel:                  k,
@@ -129,6 +131,8 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kern
 		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
 		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
 		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		MountNamespaceVFS2:      mntns,
+		FSContext:               kernel.NewFSContextVFS2(root, cwd, 0022),
 	}
 	return k.TaskSet().NewTask(config)
 }
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 69fd84ddd..b97e3534a 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -41,12 +41,12 @@ type System struct {
 	Creds *auth.Credentials
 	VFS   *vfs.VirtualFilesystem
 	Root  vfs.VirtualDentry
-	mns   *vfs.MountNamespace
+	MntNs *vfs.MountNamespace
 }
 
 // NewSystem constructs a System.
 //
-// Precondition: Caller must hold a reference on mns, whose ownership
+// Precondition: Caller must hold a reference on MntNs, whose ownership
 // is transferred to the new System.
 func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
 	s := &System{
@@ -54,7 +54,7 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns
 		Ctx:   ctx,
 		Creds: auth.CredentialsFromContext(ctx),
 		VFS:   v,
-		mns:   mns,
+		MntNs: mns,
 		Root:  mns.Root(),
 	}
 	return s
@@ -75,7 +75,7 @@ func (s *System) WithSubtest(t *testing.T) *System {
 		Ctx:   s.Ctx,
 		Creds: s.Creds,
 		VFS:   s.VFS,
-		mns:   s.mns,
+		MntNs: s.MntNs,
 		Root:  s.Root,
 	}
 }
@@ -90,7 +90,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 		Ctx:   ctx,
 		Creds: s.Creds,
 		VFS:   s.VFS,
-		mns:   s.mns,
+		MntNs: s.MntNs,
 		Root:  s.Root,
 	}
 }
@@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 // Destroy release resources associated with a test system.
 func (s *System) Destroy() {
 	s.Root.DecRef()
-	s.mns.DecRef() // Reference on mns passed to NewSystem.
+	s.MntNs.DecRef() // Reference on MntNs passed to NewSystem.
 }
 
 // ReadToEnd reads the contents of fd until EOF to a string.
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 8785452b6..7f7b791c4 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -486,7 +486,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	vfsObj := rp.VirtualFilesystem()
 	oldParentDir := oldParent.inode.impl.(*directory)
 	newParentDir := newParent.inode.impl.(*directory)
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil {
 		return err
 	}
 	if replaced != nil {
@@ -543,7 +545,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	}
 	defer mnt.EndWrite()
 	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
 		return err
 	}
 	parent.inode.impl.(*directory).childList.Remove(child)
@@ -631,7 +635,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	}
 	defer mnt.EndWrite()
 	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
 		return err
 	}
 	parent.inode.impl.(*directory).childList.Remove(child)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 2108d0f4d..c5bb17562 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -40,6 +40,9 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the default filesystem name.
+const Name = "tmpfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 2231d6973..46306945f 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -157,6 +157,7 @@ go_library(
         "//pkg/context",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/refs",
@@ -167,6 +168,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 2448c1d99..7218aa24e 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -37,10 +38,16 @@ type FSContext struct {
 	// destroyed.
 	root *fs.Dirent
 
+	// rootVFS2 is the filesystem root.
+	rootVFS2 vfs.VirtualDentry
+
 	// cwd is the current working directory. Will be nil iff the FSContext
 	// has been destroyed.
 	cwd *fs.Dirent
 
+	// cwdVFS2 is the current working directory.
+	cwdVFS2 vfs.VirtualDentry
+
 	// umask is the current file mode creation mask. When a thread using this
 	// context invokes a syscall that creates a file, bits set in umask are
 	// removed from the permissions that the file is created with.
@@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 	return &f
 }
 
+// NewFSContextVFS2 returns a new filesystem context.
+func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
+	root.IncRef()
+	cwd.IncRef()
+	f := FSContext{
+		rootVFS2: root,
+		cwdVFS2:  cwd,
+		umask:    umask,
+	}
+	f.EnableLeakCheck("kernel.FSContext")
+	return &f
+}
+
 // destroy is the destructor for an FSContext.
 //
 // This will call DecRef on both root and cwd Dirents.  If either call to
@@ -75,11 +95,17 @@ func (f *FSContext) destroy() {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.root.DecRef()
-	f.root = nil
-
-	f.cwd.DecRef()
-	f.cwd = nil
+	if VFS2Enabled {
+		f.rootVFS2.DecRef()
+		f.rootVFS2 = vfs.VirtualDentry{}
+		f.cwdVFS2.DecRef()
+		f.cwdVFS2 = vfs.VirtualDentry{}
+	} else {
+		f.root.DecRef()
+		f.root = nil
+		f.cwd.DecRef()
+		f.cwd = nil
+	}
 }
 
 // DecRef implements RefCounter.DecRef with destructor f.destroy.
@@ -93,12 +119,21 @@ func (f *FSContext) DecRef() {
 func (f *FSContext) Fork() *FSContext {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	f.cwd.IncRef()
-	f.root.IncRef()
+
+	if VFS2Enabled {
+		f.cwdVFS2.IncRef()
+		f.rootVFS2.IncRef()
+	} else {
+		f.cwd.IncRef()
+		f.root.IncRef()
+	}
+
 	return &FSContext{
-		cwd:   f.cwd,
-		root:  f.root,
-		umask: f.umask,
+		cwd:      f.cwd,
+		root:     f.root,
+		cwdVFS2:  f.cwdVFS2,
+		rootVFS2: f.rootVFS2,
+		umask:    f.umask,
 	}
 }
 
@@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext {
 func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	if f.cwd != nil {
-		f.cwd.IncRef()
-	}
+
+	f.cwd.IncRef()
 	return f.cwd
 }
 
+// WorkingDirectoryVFS2 returns the current working directory.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.cwdVFS2.IncRef()
+	return f.cwdVFS2
+}
+
 // SetWorkingDirectory sets the current working directory.
 // This will take an extra reference on the Dirent.
 //
@@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetWorkingDirectoryVFS2 sets the current working directory.
+// This will take an extra reference on the VirtualDentry.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	old := f.cwdVFS2
+	f.cwdVFS2 = d
+	d.IncRef()
+	old.DecRef()
+}
+
 // RootDirectory returns the current filesystem root.
 //
 // This will return nil if called after destroy(), otherwise it will return a
@@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
 	return f.root
 }
 
+// RootDirectoryVFS2 returns the current filesystem root.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.rootVFS2.IncRef()
+	return f.rootVFS2
+}
+
 // SetRootDirectory sets the root directory.
 // This will take an extra reference on the Dirent.
 //
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 3ee760ba2..2665f057c 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -43,11 +43,13 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/eventchannel"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -71,6 +73,10 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow
+// easy access everywhere. To be removed once VFS2 becomes the default.
+var VFS2Enabled = false
+
 // Kernel represents an emulated Linux kernel. It must be initialized by calling
 // Init() or LoadFrom().
 //
@@ -238,6 +244,9 @@ type Kernel struct {
 
 	// SpecialOpts contains special kernel options.
 	SpecialOpts
+
+	// VFS keeps the filesystem state used across the kernel.
+	VFS *vfs.VirtualFilesystem
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -624,7 +633,7 @@ type CreateProcessArgs struct {
 	// File is a passed host FD pointing to a file to load as the init binary.
 	//
 	// This is checked if and only if Filename is "".
-	File *fs.File
+	File fsbridge.File
 
 	// Argvv is a list of arguments.
 	Argv []string
@@ -673,6 +682,13 @@ type CreateProcessArgs struct {
 	// increment it).
 	MountNamespace *fs.MountNamespace
 
+	// MountNamespaceVFS2 optionally contains the mount namespace for this
+	// process. If nil, the init process's mount namespace is used.
+	//
+	// Anyone setting MountNamespaceVFS2 must donate a reference (i.e.
+	// increment it).
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// ContainerID is the container that the process belongs to.
 	ContainerID string
 }
@@ -711,11 +727,22 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.args.Credentials
 	case fs.CtxRoot:
 		if ctx.args.MountNamespace != nil {
-			// MountNamespace.Root() will take a reference on the root
-			// dirent for us.
+			// MountNamespace.Root() will take a reference on the root dirent for us.
 			return ctx.args.MountNamespace.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.args.MountNamespaceVFS2 == nil {
+			return nil
+		}
+		// MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
+		return ctx.args.MountNamespaceVFS2.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2 takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
 	case ktime.CtxRealtimeClock:
@@ -757,34 +784,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	defer k.extMu.Unlock()
 	log.Infof("EXEC: %v", args.Argv)
 
-	// Grab the mount namespace.
-	mounts := args.MountNamespace
-	if mounts == nil {
-		mounts = k.GlobalInit().Leader().MountNamespace()
-		mounts.IncRef()
-	}
-
-	tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
 	ctx := args.NewContext(k)
 
-	// Get the root directory from the MountNamespace.
-	root := mounts.Root()
-	// The call to newFSContext below will take a reference on root, so we
-	// don't need to hold this one.
-	defer root.DecRef()
-
-	// Grab the working directory.
-	remainingTraversals := uint(args.MaxSymlinkTraversals)
-	wd := root // Default.
-	if args.WorkingDirectory != "" {
-		var err error
-		wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
-		if err != nil {
-			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+	var (
+		opener    fsbridge.Lookup
+		fsContext *FSContext
+		mntns     *fs.MountNamespace
+	)
+
+	if VFS2Enabled {
+		mntnsVFS2 := args.MountNamespaceVFS2
+		if mntnsVFS2 == nil {
+			// MountNamespaceVFS2 adds a reference to the namespace, which is
+			// transferred to the new process.
+			mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
+		}
+		// Get the root directory from the MountNamespace.
+		root := args.MountNamespaceVFS2.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			pop := vfs.PathOperation{
+				Root:               root,
+				Start:              wd,
+				Path:               fspath.Parse(args.WorkingDirectory),
+				FollowFinalSymlink: true,
+			}
+			var err error
+			wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
+		fsContext = NewFSContextVFS2(root, wd, args.Umask)
+
+	} else {
+		mntns = args.MountNamespace
+		if mntns == nil {
+			mntns = k.GlobalInit().Leader().MountNamespace()
+			mntns.IncRef()
 		}
-		defer wd.DecRef()
+		// Get the root directory from the MountNamespace.
+		root := mntns.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		remainingTraversals := args.MaxSymlinkTraversals
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			var err error
+			wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewFSLookup(mntns, root, wd)
+		fsContext = newFSContext(root, wd, args.Umask)
 	}
 
+	tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+
 	// Check which file to start from.
 	switch {
 	case args.Filename != "":
@@ -805,11 +875,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	}
 
 	// Create a fresh task context.
-	remainingTraversals = uint(args.MaxSymlinkTraversals)
+	remainingTraversals := args.MaxSymlinkTraversals
 	loadArgs := loader.LoadArgs{
-		Mounts:              mounts,
-		Root:                root,
-		WorkingDirectory:    wd,
+		Opener:              opener,
 		RemainingTraversals: &remainingTraversals,
 		ResolveFinal:        true,
 		Filename:            args.Filename,
@@ -834,13 +902,14 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		Kernel:                  k,
 		ThreadGroup:             tg,
 		TaskContext:             tc,
-		FSContext:               newFSContext(root, wd, args.Umask),
+		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
+		MountNamespaceVFS2:      args.MountNamespaceVFS2,
 		ContainerID:             args.ContainerID,
 	}
 	t, err := k.tasks.NewTask(config)
@@ -1378,6 +1447,20 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 			return ctx.k.globalInit.mounts.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.k.globalInit == nil {
+			return vfs.VirtualDentry{}
+		}
+		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		defer mntns.DecRef()
+		// Root() takes a reference on the root dirent for us.
+		return mntns.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2() takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
 	case ktime.CtxRealtimeClock:
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 981e8c7fe..a3443ff21 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -424,6 +424,11 @@ type Task struct {
 	// abstractSockets is protected by mu.
 	abstractSockets *AbstractSocketNamespace
 
+	// mountNamespaceVFS2 is the task's mount namespace.
+	//
+	// It is protected by mu. It is owned by the task goroutine.
+	mountNamespaceVFS2 *vfs.MountNamespace
+
 	// parentDeathSignal is sent to this task's thread group when its parent exits.
 	//
 	// parentDeathSignal is protected by mu.
@@ -638,6 +643,11 @@ func (t *Task) Value(key interface{}) interface{} {
 		return int32(t.ThreadGroup().ID())
 	case fs.CtxRoot:
 		return t.fsContext.RootDirectory()
+	case vfs.CtxRoot:
+		return t.fsContext.RootDirectoryVFS2()
+	case vfs.CtxMountNamespace:
+		t.mountNamespaceVFS2.IncRef()
+		return t.mountNamespaceVFS2
 	case fs.CtxDirentCacheLimiter:
 		return t.k.DirentCacheLimiter
 	case inet.CtxStack:
@@ -701,6 +711,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
 // Preconditions: The caller must be running on the task goroutine, or t.mu
 // must be locked.
 func (t *Task) IsChrooted() bool {
+	if VFS2Enabled {
+		realRoot := t.mountNamespaceVFS2.Root()
+		defer realRoot.DecRef()
+		root := t.fsContext.RootDirectoryVFS2()
+		defer root.DecRef()
+		return root != realRoot
+	}
+
 	realRoot := t.tg.mounts.Root()
 	defer realRoot.DecRef()
 	root := t.fsContext.RootDirectory()
@@ -796,6 +814,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
 	return t.tg.mounts
 }
 
+// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
+// returned mount namespace.
+func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mountNamespaceVFS2.IncRef()
+	return t.mountNamespaceVFS2
+}
+
 // AbstractSockets returns t's AbstractSocketNamespace.
 func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 	return t.abstractSockets
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 53d4d211b..ba74b4c1c 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -199,6 +199,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	// TODO(b/63601033): Implement CLONE_NEWNS.
+	mntnsVFS2 := t.mountNamespaceVFS2
+	if mntnsVFS2 != nil {
+		mntnsVFS2.IncRef()
+	}
+
 	tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
 	if err != nil {
 		return 0, nil, err
@@ -241,7 +247,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	rseqAddr := usermem.Addr(0)
 	rseqSignature := uint32(0)
 	if opts.NewThreadGroup {
-		tg.mounts.IncRef()
+		if tg.mounts != nil {
+			tg.mounts.IncRef()
+		}
 		sh := t.tg.signalHandlers
 		if opts.NewSignalHandlers {
 			sh = sh.Fork()
@@ -265,6 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
 		AbstractSocketNamespace: t.abstractSockets,
+		MountNamespaceVFS2:      mntnsVFS2,
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2d6e7733c..2be982684 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -136,7 +136,7 @@ func (t *Task) Stack() *arch.Stack {
 func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
 	// If File is not nil, we should load that instead of resolving Filename.
 	if args.File != nil {
-		args.Filename = args.File.MappedName(ctx)
+		args.Filename = args.File.PathnameWithDeleted(ctx)
 	}
 
 	// Prepare a new user address space to load into.
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 435761e5a..c4ade6e8e 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	t.fsContext.DecRef()
 	t.fdTable.DecRef()
 
+	t.mu.Lock()
+	if t.mountNamespaceVFS2 != nil {
+		t.mountNamespaceVFS2.DecRef()
+		t.mountNamespaceVFS2 = nil
+	}
+	t.mu.Unlock()
+
 	// If this is the last task to exit from the thread group, release the
 	// thread group's resources.
 	if lastExiter {
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 41259210c..6d737d3e5 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
 	if !trace.IsEnabled() {
 		return
 	}
-	d := tc.MemoryManager.Executable()
-	if d == nil {
+	file := tc.MemoryManager.Executable()
+	if file == nil {
 		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
 		return
 	}
-	defer d.DecRef()
-	root := t.fsContext.RootDirectory()
-	if root == nil {
-		trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
-		return
-	}
-	defer root.DecRef()
-	n, _ := d.FullName(root)
-	trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
+	defer file.DecRef()
+	trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
 }
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index de838beef..f9236a842 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -80,6 +81,9 @@ type TaskConfig struct {
 	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
 	AbstractSocketNamespace *AbstractSocketNamespace
 
+	// MountNamespaceVFS2 is the MountNamespace of the new task.
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
 	RSeqAddr usermem.Addr
 
@@ -116,28 +120,29 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 			parent:   cfg.Parent,
 			children: make(map[*Task]struct{}),
 		},
-		runState:        (*runApp)(nil),
-		interruptChan:   make(chan struct{}, 1),
-		signalMask:      cfg.SignalMask,
-		signalStack:     arch.SignalStack{Flags: arch.SignalStackFlagDisable},
-		tc:              *tc,
-		fsContext:       cfg.FSContext,
-		fdTable:         cfg.FDTable,
-		p:               cfg.Kernel.Platform.NewContext(),
-		k:               cfg.Kernel,
-		ptraceTracees:   make(map[*Task]struct{}),
-		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
-		ioUsage:         &usage.IO{},
-		niceness:        cfg.Niceness,
-		netns:           cfg.NetworkNamespaced,
-		utsns:           cfg.UTSNamespace,
-		ipcns:           cfg.IPCNamespace,
-		abstractSockets: cfg.AbstractSocketNamespace,
-		rseqCPU:         -1,
-		rseqAddr:        cfg.RSeqAddr,
-		rseqSignature:   cfg.RSeqSignature,
-		futexWaiter:     futex.NewWaiter(),
-		containerID:     cfg.ContainerID,
+		runState:           (*runApp)(nil),
+		interruptChan:      make(chan struct{}, 1),
+		signalMask:         cfg.SignalMask,
+		signalStack:        arch.SignalStack{Flags: arch.SignalStackFlagDisable},
+		tc:                 *tc,
+		fsContext:          cfg.FSContext,
+		fdTable:            cfg.FDTable,
+		p:                  cfg.Kernel.Platform.NewContext(),
+		k:                  cfg.Kernel,
+		ptraceTracees:      make(map[*Task]struct{}),
+		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
+		ioUsage:            &usage.IO{},
+		niceness:           cfg.Niceness,
+		netns:              cfg.NetworkNamespaced,
+		utsns:              cfg.UTSNamespace,
+		ipcns:              cfg.IPCNamespace,
+		abstractSockets:    cfg.AbstractSocketNamespace,
+		mountNamespaceVFS2: cfg.MountNamespaceVFS2,
+		rseqCPU:            -1,
+		rseqAddr:           cfg.RSeqAddr,
+		rseqSignature:      cfg.RSeqSignature,
+		futexWaiter:        futex.NewWaiter(),
+		containerID:        cfg.ContainerID,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 768e958d2..268f62e9d 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -256,7 +256,7 @@ type ThreadGroup struct {
 	tty *TTY
 }
 
-// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
 // thread group leader will send its parent terminationSignal when it exits.
 // The new thread group isn't visible to the system until a task has been
 // created inside of it by a successful call to TaskSet.NewTask.
@@ -317,7 +317,9 @@ func (tg *ThreadGroup) release() {
 	for _, it := range its {
 		it.DestroyTimer()
 	}
-	tg.mounts.DecRef()
+	if tg.mounts != nil {
+		tg.mounts.DecRef()
+	}
 }
 
 // forEachChildThreadGroupLocked indicates over all child ThreadGroups.
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index 23790378a..c6aa65f28 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -33,6 +33,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
@@ -40,6 +41,7 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 122ed05c2..616fafa2c 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -27,7 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -97,11 +97,11 @@ type elfInfo struct {
 // accepts from the ELF, and it doesn't parse unnecessary parts of the file.
 //
 // ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
 	// Check ident first; it will tell us the endianness of the rest of the
 	// structs.
 	var ident [elf.EI_NIDENT]byte
-	_, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0)
+	_, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0)
 	if err != nil {
 		log.Infof("Error reading ELF ident: %v", err)
 		// The entire ident array always exists.
@@ -137,7 +137,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 
 	var hdr elf.Header64
 	hdrBuf := make([]byte, header64Size)
-	_, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0)
+	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0)
 	if err != nil {
 		log.Infof("Error reading ELF header: %v", err)
 		// The entire header always exists.
@@ -187,7 +187,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 	}
 
 	phdrBuf := make([]byte, totalPhdrSize)
-	_, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
+	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
 	if err != nil {
 		log.Infof("Error reading ELF phdrs: %v", err)
 		// If phdrs were specified, they should all exist.
@@ -227,7 +227,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 
 // mapSegment maps a phdr into the Task. offset is the offset to apply to
 // phdr.Vaddr.
-func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
+func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
 	// We must make a page-aligned mapping.
 	adjust := usermem.Addr(phdr.Vaddr).PageOffset()
 
@@ -395,7 +395,7 @@ type loadedELF struct {
 //
 // Preconditions:
 //  * f is an ELF file
-func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
+func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
 	first := true
 	var start, end usermem.Addr
 	var interpreter string
@@ -431,7 +431,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 			}
 
 			path := make([]byte, phdr.Filesz)
-			_, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off))
+			_, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off))
 			if err != nil {
 				// If an interpreter was specified, it should exist.
 				ctx.Infof("Error reading PT_INTERP path: %v", err)
@@ -564,7 +564,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 // Preconditions:
 //  * f is an ELF file
 //  * f is the first ELF loaded into m
-func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		ctx.Infof("Failed to parse initial ELF: %v", err)
@@ -602,7 +602,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS
 //
 // Preconditions:
 //  * f is an ELF file
-func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) {
+func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		if err == syserror.ENOEXEC {
@@ -649,16 +649,14 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
 		// Refresh the traversal limit.
 		*args.RemainingTraversals = linux.MaxSymlinkTraversals
 		args.Filename = bin.interpreter
-		d, i, err := openPath(ctx, args)
+		intFile, err := openPath(ctx, args)
 		if err != nil {
 			ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
 			return loadedELF{}, nil, err
 		}
-		defer i.DecRef()
-		// We don't need the Dirent.
-		d.DecRef()
+		defer intFile.DecRef()
 
-		interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin)
+		interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin)
 		if err != nil {
 			ctx.Infof("Error loading interpreter: %v", err)
 			return loadedELF{}, nil, err
diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go
index 098a45d36..3886b4d33 100644
--- a/pkg/sentry/loader/interpreter.go
+++ b/pkg/sentry/loader/interpreter.go
@@ -19,7 +19,7 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -37,9 +37,9 @@ const (
 )
 
 // parseInterpreterScript returns the interpreter path and argv.
-func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) {
+func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) {
 	line := make([]byte, interpMaxLineLength)
-	n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0)
+	n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0)
 	// Short read is OK.
 	if err != nil && err != io.ErrUnexpectedEOF {
 		if err == io.EOF {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 9a613d6b7..d6675b8f0 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"io"
 	"path"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -29,8 +28,10 @@ import (
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -41,16 +42,6 @@ type LoadArgs struct {
 	// MemoryManager is the memory manager to load the executable into.
 	MemoryManager *mm.MemoryManager
 
-	// Mounts is the mount namespace in which to look up Filename.
-	Mounts *fs.MountNamespace
-
-	// Root is the root directory under which to look up Filename.
-	Root *fs.Dirent
-
-	// WorkingDirectory is the working directory under which to look up
-	// Filename.
-	WorkingDirectory *fs.Dirent
-
 	// RemainingTraversals is the maximum number of symlinks to follow to
 	// resolve Filename. This counter is passed by reference to keep it
 	// updated throughout the call stack.
@@ -65,7 +56,12 @@ type LoadArgs struct {
 
 	// File is an open fs.File object of the executable. If File is not
 	// nil, then File will be loaded and Filename will be ignored.
-	File *fs.File
+	//
+	// The caller is responsible for checking that the user can execute this file.
+	File fsbridge.File
+
+	// Opener is used to open the executable file when 'File' is nil.
+	Opener fsbridge.Lookup
 
 	// CloseOnExec indicates that the executable (or one of its parent
 	// directories) was opened with O_CLOEXEC. If the executable is an
@@ -106,103 +102,32 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
 // installed in the Task FDTable. The caller takes ownership of both.
 //
 // args.Filename must be a readable, executable, regular file.
-func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) {
+func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) {
 	if args.Filename == "" {
 		ctx.Infof("cannot open empty name")
-		return nil, nil, syserror.ENOENT
-	}
-
-	var d *fs.Dirent
-	var err error
-	if args.ResolveFinal {
-		d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
-	} else {
-		d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
-	}
-	if err != nil {
-		return nil, nil, err
-	}
-	// Defer a DecRef for the sake of failure cases.
-	defer d.DecRef()
-
-	if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
-		return nil, nil, syserror.ELOOP
-	}
-
-	if err := checkPermission(ctx, d); err != nil {
-		return nil, nil, err
-	}
-
-	// If they claim it's a directory, then make sure.
-	//
-	// N.B. we reject directories below, but we must first reject
-	// non-directories passed as directories.
-	if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) {
-		return nil, nil, syserror.ENOTDIR
-	}
-
-	if err := checkIsRegularFile(ctx, d, args.Filename); err != nil {
-		return nil, nil, err
-	}
-
-	f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-	if err != nil {
-		return nil, nil, err
-	}
-	// Defer a DecRef for the sake of failure cases.
-	defer f.DecRef()
-
-	if err := checkPread(ctx, f, args.Filename); err != nil {
-		return nil, nil, err
-	}
-
-	d.IncRef()
-	f.IncRef()
-	return d, f, err
-}
-
-// checkFile performs checks on a file to be executed.
-func checkFile(ctx context.Context, f *fs.File, filename string) error {
-	if err := checkPermission(ctx, f.Dirent); err != nil {
-		return err
-	}
-
-	if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil {
-		return err
+		return nil, syserror.ENOENT
 	}
 
-	return checkPread(ctx, f, filename)
-}
-
-// checkPermission checks whether the file is readable and executable.
-func checkPermission(ctx context.Context, d *fs.Dirent) error {
-	perms := fs.PermMask{
-		// TODO(gvisor.dev/issue/160): Linux requires only execute
-		// permission, not read. However, our backing filesystems may
-		// prevent us from reading the file without read permission.
-		//
-		// Additionally, a task with a non-readable executable has
-		// additional constraints on access via ptrace and procfs.
-		Read:    true,
-		Execute: true,
+	// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+	// not read. However, our backing filesystems may prevent us from reading
+	// the file without read permission. Additionally, a task with a
+	// non-readable executable has additional constraints on access via
+	// ptrace and procfs.
+	opts := vfs.OpenOptions{
+		Flags:    linux.O_RDONLY,
+		FileExec: true,
 	}
-	return d.Inode.CheckPermission(ctx, perms)
+	return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal)
 }
 
 // checkIsRegularFile prevents us from trying to execute a directory, pipe, etc.
-func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error {
-	attr := d.Inode.StableAttr
-	if !fs.IsRegular(attr) {
-		ctx.Infof("%s is not regular: %v", filename, attr)
-		return syserror.EACCES
+func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error {
+	t, err := file.Type(ctx)
+	if err != nil {
+		return err
 	}
-	return nil
-}
-
-// checkPread checks whether we can read the file at arbitrary offsets.
-func checkPread(ctx context.Context, f *fs.File, filename string) error {
-	if !f.Flags().Pread {
-		ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags())
+	if t != linux.ModeRegular {
+		ctx.Infof("%q is not a regular file: %v", filename, t)
 		return syserror.EACCES
 	}
 	return nil
@@ -224,8 +149,10 @@ const (
 	maxLoaderAttempts = 6
 )
 
-// loadExecutable loads an executable that is pointed to by args.File. If nil,
-// the path args.Filename is resolved and loaded. If the executable is an
+// loadExecutable loads an executable that is pointed to by args.File. The
+// caller is responsible for checking that the user can execute this file.
+// If nil, the path args.Filename is resolved and loaded (check that the user
+// can execute this file is done here in this case). If the executable is an
 // interpreter script rather than an ELF, the binary of the corresponding
 // interpreter will be loaded.
 //
@@ -234,37 +161,27 @@ const (
 //  * arch.Context matching the binary arch
 //  * fs.Dirent of the binary file
 //  * Possibly updated args.Argv
-func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) {
 	for i := 0; i < maxLoaderAttempts; i++ {
-		var (
-			d   *fs.Dirent
-			err error
-		)
 		if args.File == nil {
-			d, args.File, err = openPath(ctx, args)
-			// We will return d in the successful case, but defer a DecRef for the
-			// sake of intermediate loops and failure cases.
-			if d != nil {
-				defer d.DecRef()
-			}
-			if args.File != nil {
-				defer args.File.DecRef()
+			var err error
+			args.File, err = openPath(ctx, args)
+			if err != nil {
+				ctx.Infof("Error opening %s: %v", args.Filename, err)
+				return loadedELF{}, nil, nil, nil, err
 			}
+			// Ensure file is release in case the code loops or errors out.
+			defer args.File.DecRef()
 		} else {
-			d = args.File.Dirent
-			d.IncRef()
-			defer d.DecRef()
-			err = checkFile(ctx, args.File, args.Filename)
-		}
-		if err != nil {
-			ctx.Infof("Error opening %s: %v", args.Filename, err)
-			return loadedELF{}, nil, nil, nil, err
+			if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil {
+				return loadedELF{}, nil, nil, nil, err
+			}
 		}
 
 		// Check the header. Is this an ELF or interpreter script?
 		var hdr [4]uint8
 		// N.B. We assume that reading from a regular file cannot block.
-		_, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0)
+		_, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0)
 		// Allow unexpected EOF, as a valid executable could be only three bytes
 		// (e.g., #!a).
 		if err != nil && err != io.ErrUnexpectedEOF {
@@ -281,9 +198,10 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 				ctx.Infof("Error loading ELF: %v", err)
 				return loadedELF{}, nil, nil, nil, err
 			}
-			// An ELF is always terminal. Hold on to d.
-			d.IncRef()
-			return loaded, ac, d, args.Argv, err
+			// An ELF is always terminal. Hold on to file.
+			args.File.IncRef()
+			return loaded, ac, args.File, args.Argv, err
+
 		case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
 			if args.CloseOnExec {
 				return loadedELF{}, nil, nil, nil, syserror.ENOENT
@@ -295,6 +213,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 			}
 			// Refresh the traversal limit for the interpreter.
 			*args.RemainingTraversals = linux.MaxSymlinkTraversals
+
 		default:
 			ctx.Infof("Unknown magic: %v", hdr)
 			return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
@@ -317,11 +236,11 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 //  * Load is called on the Task goroutine.
 func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the executable itself.
-	loaded, ac, d, newArgv, err := loadExecutable(ctx, args)
+	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
 	}
-	defer d.DecRef()
+	defer file.DecRef()
 
 	// Load the VDSO.
 	vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
@@ -390,7 +309,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
 	m.SetEnvvStart(sl.EnvvStart)
 	m.SetEnvvEnd(sl.EnvvEnd)
 	m.SetAuxv(auxv)
-	m.SetExecutable(d)
+	m.SetExecutable(file)
 
 	ac.SetIP(uintptr(loaded.entry))
 	ac.SetStack(uintptr(stack.Bottom))
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 52f446ed7..161b28c2c 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -69,6 +70,8 @@ type byteReader struct {
 var _ fs.FileOperations = (*byteReader)(nil)
 
 // newByteReaderFile creates a fake file to read data from.
+//
+// TODO(gvisor.dev/issue/1623): Convert to VFS2.
 func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
 	// Create a fake inode.
 	inode := fs.NewInode(
@@ -123,7 +126,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
 // * PT_LOAD segments don't extend beyond the end of the file.
 //
 // ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		log.Infof("Unable to parse VDSO header: %v", err)
@@ -221,7 +224,7 @@ type VDSO struct {
 // PrepareVDSO validates the system VDSO and returns a VDSO, containing the
 // param page for updating by the kernel.
 func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
-	vdsoFile := newByteReaderFile(ctx, vdsoBin)
+	vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
 
 	// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
 	// nil context can be passed.
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index e5729ced5..73591dab7 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -105,8 +105,8 @@ go_library(
         "//pkg/safecopy",
         "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/fs",
         "//pkg/sentry/fs/proc/seqfile",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/futex",
         "//pkg/sentry/kernel/shm",
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index f550acae0..6a49334f4 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -16,7 +16,7 @@ package mm
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) {
 //
 // An additional reference will be taken in the case of a non-nil executable,
 // which must be released by the caller.
-func (mm *MemoryManager) Executable() *fs.Dirent {
+func (mm *MemoryManager) Executable() fsbridge.File {
 	mm.metadataMu.Lock()
 	defer mm.metadataMu.Unlock()
 
@@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent {
 // SetExecutable sets the executable.
 //
 // This takes a reference on d.
-func (mm *MemoryManager) SetExecutable(d *fs.Dirent) {
+func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
 	mm.metadataMu.Lock()
 
 	// Grab a new reference.
-	d.IncRef()
+	file.IncRef()
 
 	// Set the executable.
 	orig := mm.executable
-	mm.executable = d
+	mm.executable = file
 
 	mm.metadataMu.Unlock()
 
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 09e582dd3..637383c7a 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -37,7 +37,7 @@ package mm
 import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -215,7 +215,7 @@ type MemoryManager struct {
 	// is not nil, it holds a reference on the Dirent.
 	//
 	// executable is protected by metadataMu.
-	executable *fs.Dirent
+	executable fsbridge.File
 
 	// dumpability describes if and how this MemoryManager may be dumped to
 	// userspace.
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index a796b2396..46cb2a1cc 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -141,6 +141,10 @@ func path(t *kernel.Task, addr usermem.Addr) string {
 }
 
 func fd(t *kernel.Task, fd int32) string {
+	if kernel.VFS2Enabled {
+		return fdVFS2(t, fd)
+	}
+
 	root := t.FSContext().RootDirectory()
 	if root != nil {
 		defer root.DecRef()
@@ -169,6 +173,30 @@ func fd(t *kernel.Task, fd int32) string {
 	return fmt.Sprintf("%#x %s", fd, name)
 }
 
+func fdVFS2(t *kernel.Task, fd int32) string {
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+
+	vfsObj := root.Mount().Filesystem().VirtualFilesystem()
+	if fd == linux.AT_FDCWD {
+		wd := t.FSContext().WorkingDirectoryVFS2()
+		defer wd.DecRef()
+
+		name, _ := vfsObj.PathnameWithDeleted(t, root, wd)
+		return fmt.Sprintf("AT_FDCWD %s", name)
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// Cast FD to uint64 to avoid printing negative hex.
+		return fmt.Sprintf("%#x (bad FD)", uint64(fd))
+	}
+	defer file.DecRef()
+
+	name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry())
+	return fmt.Sprintf("%#x %s", fd, name)
+}
+
 func fdpair(t *kernel.Task, addr usermem.Addr) string {
 	var fds [2]int32
 	_, err := t.CopyIn(addr, &fds)
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index be16ee686..0d24fd3c4 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -74,6 +74,7 @@ go_library(
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/epoll",
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 98db32d77..9c6728530 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -135,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			}
 
 			// Set the underlying executable.
-			t.MemoryManager().SetExecutable(file.Dirent)
+			t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file))
 
 		case linux.PR_SET_MM_AUXV,
 			linux.PR_SET_MM_START_CODE,
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 0c9e2255d..00915fdde 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
@@ -119,7 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 	defer root.DecRef()
 
 	var wd *fs.Dirent
-	var executable *fs.File
+	var executable fsbridge.File
 	var closeOnExec bool
 	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
 		// Even if the pathname is absolute, we may still need the wd
@@ -136,7 +137,15 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		closeOnExec = fdFlags.CloseOnExec
 
 		if atEmptyPath && len(pathname) == 0 {
-			executable = f
+			// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+			// not read. However, our backing filesystems may prevent us from reading
+			// the file without read permission. Additionally, a task with a
+			// non-readable executable has additional constraints on access via
+			// ptrace and procfs.
+			if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil {
+				return 0, nil, err
+			}
+			executable = fsbridge.NewFSFile(f)
 		} else {
 			wd = f.Dirent
 			wd.IncRef()
@@ -152,9 +161,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 	// Load the new TaskContext.
 	remainingTraversals := uint(linux.MaxSymlinkTraversals)
 	loadArgs := loader.LoadArgs{
-		Mounts:              t.MountNamespace(),
-		Root:                root,
-		WorkingDirectory:    wd,
+		Opener:              fsbridge.NewFSLookup(t.MountNamespace(), root, wd),
 		RemainingTraversals: &remainingTraversals,
 		ResolveFinal:        resolveFinal,
 		Filename:            pathname,
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index c134714ee..e0ac32b33 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -22,4 +22,110 @@ import (
 // Override syscall table to add syscalls implementations from this package.
 func Override(table map[uintptr]kernel.Syscall) {
 	table[0] = syscalls.Supported("read", Read)
+
+	// Remove syscalls that haven't been converted yet. It's better to get ENOSYS
+	// rather than a SIGSEGV deep in the stack.
+	delete(table, 1)   // write
+	delete(table, 2)   // open
+	delete(table, 3)   // close
+	delete(table, 4)   // stat
+	delete(table, 5)   // fstat
+	delete(table, 6)   // lstat
+	delete(table, 7)   // poll
+	delete(table, 8)   // lseek
+	delete(table, 9)   // mmap
+	delete(table, 16)  // ioctl
+	delete(table, 17)  // pread64
+	delete(table, 18)  // pwrite64
+	delete(table, 19)  // readv
+	delete(table, 20)  // writev
+	delete(table, 21)  // access
+	delete(table, 22)  // pipe
+	delete(table, 32)  // dup
+	delete(table, 33)  // dup2
+	delete(table, 40)  // sendfile
+	delete(table, 59)  // execve
+	delete(table, 72)  // fcntl
+	delete(table, 73)  // flock
+	delete(table, 74)  // fsync
+	delete(table, 75)  // fdatasync
+	delete(table, 76)  // truncate
+	delete(table, 77)  // ftruncate
+	delete(table, 78)  // getdents
+	delete(table, 79)  // getcwd
+	delete(table, 80)  // chdir
+	delete(table, 81)  // fchdir
+	delete(table, 82)  // rename
+	delete(table, 83)  // mkdir
+	delete(table, 84)  // rmdir
+	delete(table, 85)  // creat
+	delete(table, 86)  // link
+	delete(table, 87)  // unlink
+	delete(table, 88)  // symlink
+	delete(table, 89)  // readlink
+	delete(table, 90)  // chmod
+	delete(table, 91)  // fchmod
+	delete(table, 92)  // chown
+	delete(table, 93)  // fchown
+	delete(table, 94)  // lchown
+	delete(table, 133) // mknod
+	delete(table, 137) // statfs
+	delete(table, 138) // fstatfs
+	delete(table, 161) // chroot
+	delete(table, 162) // sync
+	delete(table, 165) // mount
+	delete(table, 166) // umount2
+	delete(table, 172) // iopl
+	delete(table, 173) // ioperm
+	delete(table, 187) // readahead
+	delete(table, 188) // setxattr
+	delete(table, 189) // lsetxattr
+	delete(table, 190) // fsetxattr
+	delete(table, 191) // getxattr
+	delete(table, 192) // lgetxattr
+	delete(table, 193) // fgetxattr
+	delete(table, 206) // io_setup
+	delete(table, 207) // io_destroy
+	delete(table, 208) // io_getevents
+	delete(table, 209) // io_submit
+	delete(table, 210) // io_cancel
+	delete(table, 213) // epoll_create
+	delete(table, 214) // epoll_ctl_old
+	delete(table, 215) // epoll_wait_old
+	delete(table, 216) // remap_file_pages
+	delete(table, 217) // getdents64
+	delete(table, 232) // epoll_wait
+	delete(table, 233) // epoll_ctl
+	delete(table, 253) // inotify_init
+	delete(table, 254) // inotify_add_watch
+	delete(table, 255) // inotify_rm_watch
+	delete(table, 257) // openat
+	delete(table, 258) // mkdirat
+	delete(table, 259) // mknodat
+	delete(table, 260) // fchownat
+	delete(table, 261) // futimesat
+	delete(table, 262) // fstatat
+	delete(table, 263) // unlinkat
+	delete(table, 264) // renameat
+	delete(table, 265) // linkat
+	delete(table, 266) // symlinkat
+	delete(table, 267) // readlinkat
+	delete(table, 268) // fchmodat
+	delete(table, 269) // faccessat
+	delete(table, 270) // pselect
+	delete(table, 271) // ppoll
+	delete(table, 285) // fallocate
+	delete(table, 291) // epoll_create1
+	delete(table, 292) // dup3
+	delete(table, 293) // pipe2
+	delete(table, 294) // inotify_init1
+	delete(table, 295) // preadv
+	delete(table, 296) // pwritev
+	delete(table, 306) // syncfs
+	delete(table, 316) // renameat2
+	delete(table, 319) // memfd_create
+	delete(table, 322) // execveat
+	delete(table, 327) // preadv2
+	delete(table, 328) // pwritev2
+	delete(table, 332) // statx
 }
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 14b39eb9d..0b4f18ab5 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -43,6 +43,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index d97362b9a..82781e6d3 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -29,9 +29,10 @@ const (
 	CtxRoot
 )
 
-// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
-// not take a reference on the returned MountNamespace. If ctx is not
-// associated with a MountNamespace, MountNamespaceFromContext returns nil.
+// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is
+// not associated with a MountNamespace, MountNamespaceFromContext returns nil.
+//
+// A reference is taken on the returned MountNamespace.
 func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
 	if v := ctx.Value(CtxMountNamespace); v != nil {
 		return v.(*MountNamespace)
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1fbb420f9..ad2c9fcf4 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -114,6 +114,7 @@ type MountNamespace struct {
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
+		ctx.Warningf("Unknown filesystem: %s", fsTypeName)
 		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -231,9 +232,12 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 		return syserror.EINVAL
 	}
 	vfs.mountMu.Lock()
-	if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
-		vfs.mountMu.Unlock()
-		return syserror.EINVAL
+	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
+		defer mntns.DecRef()
+		if mntns != vd.mount.ns {
+			vfs.mountMu.Unlock()
+			return syserror.EINVAL
+		}
 	}
 
 	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index fdf8be157..6af7fdac1 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -61,7 +61,7 @@ type MountOptions struct {
 type OpenOptions struct {
 	// Flags contains access mode and flags as specified for open(2).
 	//
-	// FilesystemImpls is reponsible for implementing the following flags:
+	// FilesystemImpls are responsible for implementing the following flags:
 	// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
 	// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
 	// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 9629afee9..51deae313 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -393,7 +393,8 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 					// be executed.
 					return nil, syserror.EACCES
 				}
-				if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular {
+				if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular {
+					ctx.Infof("%q is not a regular file: %v", pop.Path, t)
 					return nil, syserror.EACCES
 				}
 			}
@@ -743,6 +744,8 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
 // VirtualDentry methods require that a reference is held on the VirtualDentry.
 //
 // VirtualDentry is analogous to Linux's struct path.
+//
+// +stateify savable
 type VirtualDentry struct {
 	mount  *Mount
 	dentry *Dentry
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 9f0d5d7af..239ca5302 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -795,16 +795,19 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
+	// TODO(gvisor.dev/issue/1623): Add VFS2 support
+
 	// Get the container MountNamespace from the Task.
 	tg.Leader().WithMuLocked(func(t *kernel.Task) {
-		// task.MountNamespace() does not take a ref, so we must do so
-		// ourselves.
+		// task.MountNamespace() does not take a ref, so we must do so ourselves.
 		args.MountNamespace = t.MountNamespace()
 		args.MountNamespace.IncRef()
 	})
-	defer args.MountNamespace.DecRef()
+	if args.MountNamespace != nil {
+		defer args.MountNamespace.DecRef()
+	}
 
-	// Add the HOME enviroment varible if it is not already set.
+	// Add the HOME environment variable if it is not already set.
 	root := args.MountNamespace.Root()
 	defer root.DecRef()
 	ctx := fs.WithRoot(l.k.SupervisorContext(), root)
-- 
cgit v1.2.3


From 3c26f5ecb0087337b1f194b6d429ce68f3af70eb Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 12:07:08 -0800
Subject: Enable automated marshalling for struct stat.

This requires fixing a few build issues for non-am64 platforms.

PiperOrigin-RevId: 295196922
---
 pkg/abi/linux/BUILD                         |  1 +
 pkg/abi/linux/file_amd64.go                 |  2 +
 pkg/abi/linux/file_arm64.go                 |  2 +
 pkg/abi/linux/time.go                       |  2 +
 pkg/sentry/kernel/BUILD                     |  1 +
 pkg/sentry/syscalls/linux/BUILD             |  2 -
 pkg/sentry/syscalls/linux/sys_stat.go       | 26 +++++++++-
 pkg/sentry/syscalls/linux/sys_stat_amd64.go | 75 ----------------------------
 pkg/sentry/syscalls/linux/sys_stat_arm64.go | 77 -----------------------------
 9 files changed, 32 insertions(+), 156 deletions(-)
 delete mode 100644 pkg/sentry/syscalls/linux/sys_stat_amd64.go
 delete mode 100644 pkg/sentry/syscalls/linux/sys_stat_arm64.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 1f3c0c687..b7015367b 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -59,6 +59,7 @@ go_library(
         "wait.go",
         "xattr.go",
     ],
+    marshal = True,
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi",
diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go
index 8693d49c8..6b72364ea 100644
--- a/pkg/abi/linux/file_amd64.go
+++ b/pkg/abi/linux/file_amd64.go
@@ -25,6 +25,8 @@ const (
 )
 
 // Stat represents struct stat.
+//
+// +marshal
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go
index ea3adc5f5..6492c9038 100644
--- a/pkg/abi/linux/file_arm64.go
+++ b/pkg/abi/linux/file_arm64.go
@@ -25,6 +25,8 @@ const (
 )
 
 // Stat represents struct stat.
+//
+// +marshal
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index 5c5a58cd4..e562b46d9 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -101,6 +101,8 @@ func NsecToTimeT(nsec int64) TimeT {
 }
 
 // Timespec represents struct timespec in <time.h>.
+//
+// +marshal
 type Timespec struct {
 	Sec  int64
 	Nsec int64
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 46306945f..beba29a09 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -201,6 +201,7 @@ go_library(
         "//pkg/tcpip/stack",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 0d24fd3c4..c7883e68e 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,8 +42,6 @@ go_library(
         "sys_socket.go",
         "sys_splice.go",
         "sys_stat.go",
-        "sys_stat_amd64.go",
-        "sys_stat_arm64.go",
         "sys_sync.go",
         "sys_sysinfo.go",
         "sys_syslog.go",
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index c841abccb..8b66a9006 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -23,6 +23,24 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uattr.Links,
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: sattr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
 // Stat implements linux syscall stat(2).
 func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -112,7 +130,9 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
 	if err != nil {
 		return err
 	}
-	return copyOutStat(t, statAddr, d.Inode.StableAttr, uattr)
+	s := statFromAttrs(t, d.Inode.StableAttr, uattr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // fstat implements fstat for the given *fs.File.
@@ -121,7 +141,9 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr)
+	s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // Statx implements linux syscall statx(2).
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
deleted file mode 100644
index 75a567bd4..000000000
--- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//+build amd64
-
-package linux
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
-	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
-	// Dev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
-	// Ino (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
-	// Nlink (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
-	// Mode (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
-	// UID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-	// GID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
-	// Padding (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
-	// Rdev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
-	// Size (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
-	// Blksize (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
-	// Blocks (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
-	// ATime
-	atime := uattr.AccessTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
-	// MTime
-	mtime := uattr.ModificationTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
-	// CTime
-	ctime := uattr.StatusChangeTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
-	_, err := t.CopyOutBytes(dst, b)
-	return err
-}
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
deleted file mode 100644
index 80c98d05c..000000000
--- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//+build arm64
-
-package linux
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
-	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
-	// Dev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
-	// Ino (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
-	// Mode (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
-	// Nlink (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links))
-	// UID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-	// GID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
-	// Rdev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
-	// Padding (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, 0)
-	// Size (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
-	// Blksize (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize))
-	// Padding (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
-	// Blocks (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
-	// ATime
-	atime := uattr.AccessTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
-	// MTime
-	mtime := uattr.ModificationTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
-	// CTime
-	ctime := uattr.StatusChangeTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
-	_, err := t.CopyOutBytes(dst, b)
-	return err
-}
-- 
cgit v1.2.3


From 50c493193b72997a6b09f353fd9217349941c494 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 12:22:25 -0800
Subject: Un-export p9 message encode/decode functions.

These are not used outside of the p9 package.

PiperOrigin-RevId: 295200052
---
 pkg/p9/buffer.go             |  10 +-
 pkg/p9/messages.go           | 672 +++++++++++++++++++++----------------------
 pkg/p9/messages_test.go      |   4 +-
 pkg/p9/p9.go                 |  68 ++---
 pkg/p9/transport.go          |   4 +-
 pkg/p9/transport_flipcall.go |   4 +-
 pkg/p9/transport_test.go     |   8 +-
 7 files changed, 385 insertions(+), 385 deletions(-)

(limited to 'pkg')

diff --git a/pkg/p9/buffer.go b/pkg/p9/buffer.go
index 249536d8a..6a4951821 100644
--- a/pkg/p9/buffer.go
+++ b/pkg/p9/buffer.go
@@ -20,16 +20,16 @@ import (
 
 // encoder is used for messages and 9P primitives.
 type encoder interface {
-	// Decode decodes from the given buffer. Decode may be called more than once
+	// decode decodes from the given buffer. decode may be called more than once
 	// to reuse the instance. It must clear any previous state.
 	//
 	// This may not fail, exhaustion will be recorded in the buffer.
-	Decode(b *buffer)
+	decode(b *buffer)
 
-	// Encode encodes to the given buffer.
+	// encode encodes to the given buffer.
 	//
 	// This may not fail.
-	Encode(b *buffer)
+	encode(b *buffer)
 }
 
 // order is the byte order used for encoding.
@@ -39,7 +39,7 @@ var order = binary.LittleEndian
 //
 // This is passed to the encoder methods.
 type buffer struct {
-	// data is the underlying data. This may grow during Encode.
+	// data is the underlying data. This may grow during encode.
 	data []byte
 
 	// overflow indicates whether an overflow has occurred.
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index b1cede5f5..3863ad1f5 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -51,7 +51,7 @@ type payloader interface {
 	// SetPayload returns the decoded message.
 	//
 	// This is going to be total message size - FixedSize. But this should
-	// be validated during Decode, which will be called after SetPayload.
+	// be validated during decode, which will be called after SetPayload.
 	SetPayload([]byte)
 }
 
@@ -90,14 +90,14 @@ type Tversion struct {
 	Version string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tversion) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tversion) decode(b *buffer) {
 	t.MSize = b.Read32()
 	t.Version = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tversion) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tversion) encode(b *buffer) {
 	b.Write32(t.MSize)
 	b.WriteString(t.Version)
 }
@@ -121,14 +121,14 @@ type Rversion struct {
 	Version string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rversion) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rversion) decode(b *buffer) {
 	r.MSize = b.Read32()
 	r.Version = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rversion) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rversion) encode(b *buffer) {
 	b.Write32(r.MSize)
 	b.WriteString(r.Version)
 }
@@ -149,13 +149,13 @@ type Tflush struct {
 	OldTag Tag
 }
 
-// Decode implements encoder.Decode.
-func (t *Tflush) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tflush) decode(b *buffer) {
 	t.OldTag = b.ReadTag()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tflush) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tflush) encode(b *buffer) {
 	b.WriteTag(t.OldTag)
 }
 
@@ -173,12 +173,12 @@ func (t *Tflush) String() string {
 type Rflush struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rflush) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rflush) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rflush) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rflush) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -203,8 +203,8 @@ type Twalk struct {
 	Names []string
 }
 
-// Decode implements encoder.Decode.
-func (t *Twalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twalk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	n := b.Read16()
@@ -214,8 +214,8 @@ func (t *Twalk) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (t *Twalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Twalk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.Write16(uint16(len(t.Names)))
@@ -240,22 +240,22 @@ type Rwalk struct {
 	QIDs []QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rwalk) decode(b *buffer) {
 	n := b.Read16()
 	r.QIDs = r.QIDs[:0]
 	for i := 0; i < int(n); i++ {
 		var q QID
-		q.Decode(b)
+		q.decode(b)
 		r.QIDs = append(r.QIDs, q)
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rwalk) encode(b *buffer) {
 	b.Write16(uint16(len(r.QIDs)))
 	for _, q := range r.QIDs {
-		q.Encode(b)
+		q.encode(b)
 	}
 }
 
@@ -275,13 +275,13 @@ type Tclunk struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tclunk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tclunk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tclunk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tclunk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -299,12 +299,12 @@ func (t *Tclunk) String() string {
 type Rclunk struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rclunk) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rclunk) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rclunk) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rclunk) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -325,13 +325,13 @@ type Tremove struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tremove) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tremove) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tremove) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tremove) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -349,12 +349,12 @@ func (t *Tremove) String() string {
 type Rremove struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rremove) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rremove) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rremove) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rremove) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -374,13 +374,13 @@ type Rlerror struct {
 	Error uint32
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlerror) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rlerror) decode(b *buffer) {
 	r.Error = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlerror) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rlerror) encode(b *buffer) {
 	b.Write32(r.Error)
 }
 
@@ -409,16 +409,16 @@ type Tauth struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tauth) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tauth) decode(b *buffer) {
 	t.AuthenticationFID = b.ReadFID()
 	t.UserName = b.ReadString()
 	t.AttachName = b.ReadString()
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tauth) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tauth) encode(b *buffer) {
 	b.WriteFID(t.AuthenticationFID)
 	b.WriteString(t.UserName)
 	b.WriteString(t.AttachName)
@@ -437,7 +437,7 @@ func (t *Tauth) String() string {
 
 // Rauth is an authentication response.
 //
-// Encode, Decode and Length are inherited directly from QID.
+// encode and decode are inherited directly from QID.
 type Rauth struct {
 	QID
 }
@@ -463,16 +463,16 @@ type Tattach struct {
 	Auth Tauth
 }
 
-// Decode implements encoder.Decode.
-func (t *Tattach) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tattach) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Auth.Decode(b)
+	t.Auth.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tattach) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tattach) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Auth.Encode(b)
+	t.Auth.encode(b)
 }
 
 // Type implements message.Type.
@@ -509,14 +509,14 @@ type Tlopen struct {
 	Flags OpenFlags
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlopen) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlopen) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Flags = b.ReadOpenFlags()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlopen) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlopen) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteOpenFlags(t.Flags)
 }
@@ -542,15 +542,15 @@ type Rlopen struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlopen) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rlopen) decode(b *buffer) {
+	r.QID.decode(b)
 	r.IoUnit = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlopen) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rlopen) encode(b *buffer) {
+	r.QID.encode(b)
 	b.Write32(r.IoUnit)
 }
 
@@ -587,8 +587,8 @@ type Tlcreate struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlcreate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlcreate) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.OpenFlags = b.ReadOpenFlags()
@@ -596,8 +596,8 @@ func (t *Tlcreate) Decode(b *buffer) {
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlcreate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlcreate) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.WriteOpenFlags(t.OpenFlags)
@@ -617,7 +617,7 @@ func (t *Tlcreate) String() string {
 
 // Rlcreate is a create response.
 //
-// The Encode, Decode, etc. methods are inherited from Rlopen.
+// The encode, decode, etc. methods are inherited from Rlopen.
 type Rlcreate struct {
 	Rlopen
 }
@@ -647,16 +647,16 @@ type Tsymlink struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsymlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsymlink) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Target = b.ReadString()
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsymlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsymlink) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WriteString(t.Target)
@@ -679,14 +679,14 @@ type Rsymlink struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rsymlink) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rsymlink) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rsymlink) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rsymlink) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -711,15 +711,15 @@ type Tlink struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlink) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Target = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlink) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteFID(t.Target)
 	b.WriteString(t.Name)
@@ -744,12 +744,12 @@ func (*Rlink) Type() MsgType {
 	return MsgRlink
 }
 
-// Decode implements encoder.Decode.
-func (*Rlink) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rlink) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rlink) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rlink) encode(*buffer) {
 }
 
 // String implements fmt.Stringer.
@@ -772,16 +772,16 @@ type Trenameat struct {
 	NewName string
 }
 
-// Decode implements encoder.Decode.
-func (t *Trenameat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Trenameat) decode(b *buffer) {
 	t.OldDirectory = b.ReadFID()
 	t.OldName = b.ReadString()
 	t.NewDirectory = b.ReadFID()
 	t.NewName = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Trenameat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Trenameat) encode(b *buffer) {
 	b.WriteFID(t.OldDirectory)
 	b.WriteString(t.OldName)
 	b.WriteFID(t.NewDirectory)
@@ -802,12 +802,12 @@ func (t *Trenameat) String() string {
 type Rrenameat struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rrenameat) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rrenameat) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rrenameat) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rrenameat) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -832,15 +832,15 @@ type Tunlinkat struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tunlinkat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tunlinkat) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tunlinkat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tunlinkat) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.Write32(t.Flags)
@@ -860,12 +860,12 @@ func (t *Tunlinkat) String() string {
 type Runlinkat struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Runlinkat) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Runlinkat) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Runlinkat) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Runlinkat) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -893,15 +893,15 @@ type Trename struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Trename) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Trename) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Trename) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Trename) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
@@ -921,12 +921,12 @@ func (t *Trename) String() string {
 type Rrename struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rrename) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rrename) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rrename) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rrename) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -945,13 +945,13 @@ type Treadlink struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Treadlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Treadlink) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Treadlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Treadlink) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -971,13 +971,13 @@ type Rreadlink struct {
 	Target string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rreadlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rreadlink) decode(b *buffer) {
 	r.Target = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rreadlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rreadlink) encode(b *buffer) {
 	b.WriteString(r.Target)
 }
 
@@ -1003,15 +1003,15 @@ type Tread struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tread) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tread) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Offset = b.Read64()
 	t.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tread) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tread) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Offset)
 	b.Write32(t.Count)
@@ -1033,20 +1033,20 @@ type Rread struct {
 	Data []byte
 }
 
-// Decode implements encoder.Decode.
+// decode implements encoder.decode.
 //
 // Data is automatically decoded via Payload.
-func (r *Rread) Decode(b *buffer) {
+func (r *Rread) decode(b *buffer) {
 	count := b.Read32()
 	if count != uint32(len(r.Data)) {
 		b.markOverrun()
 	}
 }
 
-// Encode implements encoder.Encode.
+// encode implements encoder.encode.
 //
 // Data is automatically encoded via Payload.
-func (r *Rread) Encode(b *buffer) {
+func (r *Rread) encode(b *buffer) {
 	b.Write32(uint32(len(r.Data)))
 }
 
@@ -1087,8 +1087,8 @@ type Twrite struct {
 	Data []byte
 }
 
-// Decode implements encoder.Decode.
-func (t *Twrite) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twrite) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Offset = b.Read64()
 	count := b.Read32()
@@ -1097,10 +1097,10 @@ func (t *Twrite) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
+// encode implements encoder.encode.
 //
 // This uses the buffer payload to avoid a copy.
-func (t *Twrite) Encode(b *buffer) {
+func (t *Twrite) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Offset)
 	b.Write32(uint32(len(t.Data)))
@@ -1137,13 +1137,13 @@ type Rwrite struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwrite) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rwrite) decode(b *buffer) {
 	r.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwrite) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rwrite) encode(b *buffer) {
 	b.Write32(r.Count)
 }
 
@@ -1178,8 +1178,8 @@ type Tmknod struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tmknod) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tmknod) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Mode = b.ReadFileMode()
@@ -1188,8 +1188,8 @@ func (t *Tmknod) Decode(b *buffer) {
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tmknod) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tmknod) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WriteFileMode(t.Mode)
@@ -1214,14 +1214,14 @@ type Rmknod struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rmknod) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rmknod) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rmknod) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rmknod) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -1249,16 +1249,16 @@ type Tmkdir struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tmkdir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tmkdir) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Permissions = b.ReadPermissions()
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tmkdir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tmkdir) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WritePermissions(t.Permissions)
@@ -1281,14 +1281,14 @@ type Rmkdir struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rmkdir) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rmkdir) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rmkdir) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rmkdir) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -1310,16 +1310,16 @@ type Tgetattr struct {
 	AttrMask AttrMask
 }
 
-// Decode implements encoder.Decode.
-func (t *Tgetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tgetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.AttrMask.Decode(b)
+	t.AttrMask.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tgetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tgetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.AttrMask.Encode(b)
+	t.AttrMask.encode(b)
 }
 
 // Type implements message.Type.
@@ -1344,18 +1344,18 @@ type Rgetattr struct {
 	Attr Attr
 }
 
-// Decode implements encoder.Decode.
-func (r *Rgetattr) Decode(b *buffer) {
-	r.Valid.Decode(b)
-	r.QID.Decode(b)
-	r.Attr.Decode(b)
+// decode implements encoder.decode.
+func (r *Rgetattr) decode(b *buffer) {
+	r.Valid.decode(b)
+	r.QID.decode(b)
+	r.Attr.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rgetattr) Encode(b *buffer) {
-	r.Valid.Encode(b)
-	r.QID.Encode(b)
-	r.Attr.Encode(b)
+// encode implements encoder.encode.
+func (r *Rgetattr) encode(b *buffer) {
+	r.Valid.encode(b)
+	r.QID.encode(b)
+	r.Attr.encode(b)
 }
 
 // Type implements message.Type.
@@ -1380,18 +1380,18 @@ type Tsetattr struct {
 	SetAttr SetAttr
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Valid.Decode(b)
-	t.SetAttr.Decode(b)
+	t.Valid.decode(b)
+	t.SetAttr.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Valid.Encode(b)
-	t.SetAttr.Encode(b)
+	t.Valid.encode(b)
+	t.SetAttr.encode(b)
 }
 
 // Type implements message.Type.
@@ -1408,12 +1408,12 @@ func (t *Tsetattr) String() string {
 type Rsetattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rsetattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rsetattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rsetattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rsetattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1435,18 +1435,18 @@ type Tallocate struct {
 	Length uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tallocate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tallocate) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Mode.Decode(b)
+	t.Mode.decode(b)
 	t.Offset = b.Read64()
 	t.Length = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tallocate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tallocate) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Mode.Encode(b)
+	t.Mode.encode(b)
 	b.Write64(t.Offset)
 	b.Write64(t.Length)
 }
@@ -1465,12 +1465,12 @@ func (t *Tallocate) String() string {
 type Rallocate struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rallocate) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rallocate) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rallocate) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rallocate) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1492,14 +1492,14 @@ type Tlistxattr struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlistxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlistxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlistxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlistxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Size)
 }
@@ -1520,8 +1520,8 @@ type Rlistxattr struct {
 	Xattrs []string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlistxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rlistxattr) decode(b *buffer) {
 	n := b.Read16()
 	r.Xattrs = r.Xattrs[:0]
 	for i := 0; i < int(n); i++ {
@@ -1529,8 +1529,8 @@ func (r *Rlistxattr) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlistxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rlistxattr) encode(b *buffer) {
 	b.Write16(uint16(len(r.Xattrs)))
 	for _, x := range r.Xattrs {
 		b.WriteString(x)
@@ -1559,15 +1559,15 @@ type Txattrwalk struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Txattrwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Txattrwalk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Txattrwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Txattrwalk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.WriteString(t.Name)
@@ -1589,13 +1589,13 @@ type Rxattrwalk struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (r *Rxattrwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rxattrwalk) decode(b *buffer) {
 	r.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rxattrwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rxattrwalk) encode(b *buffer) {
 	b.Write64(r.Size)
 }
 
@@ -1627,16 +1627,16 @@ type Txattrcreate struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Txattrcreate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Txattrcreate) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.AttrSize = b.Read64()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Txattrcreate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Txattrcreate) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.Write64(t.AttrSize)
@@ -1657,12 +1657,12 @@ func (t *Txattrcreate) String() string {
 type Rxattrcreate struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rxattrcreate) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rxattrcreate) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rxattrcreate) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rxattrcreate) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1687,15 +1687,15 @@ type Tgetxattr struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tgetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tgetxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tgetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tgetxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.Write64(t.Size)
@@ -1717,13 +1717,13 @@ type Rgetxattr struct {
 	Value string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rgetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rgetxattr) decode(b *buffer) {
 	r.Value = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rgetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rgetxattr) encode(b *buffer) {
 	b.WriteString(r.Value)
 }
 
@@ -1752,16 +1752,16 @@ type Tsetxattr struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsetxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Value = b.ReadString()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsetxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.WriteString(t.Value)
@@ -1782,12 +1782,12 @@ func (t *Tsetxattr) String() string {
 type Rsetxattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rsetxattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rsetxattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rsetxattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rsetxattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1809,14 +1809,14 @@ type Tremovexattr struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tremovexattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tremovexattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tremovexattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tremovexattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 }
@@ -1835,12 +1835,12 @@ func (t *Tremovexattr) String() string {
 type Rremovexattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rremovexattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rremovexattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rremovexattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rremovexattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1865,15 +1865,15 @@ type Treaddir struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Treaddir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Treaddir) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Offset = b.Read64()
 	t.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Treaddir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Treaddir) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.Write64(t.Offset)
 	b.Write32(t.Count)
@@ -1907,14 +1907,14 @@ type Rreaddir struct {
 	payload []byte
 }
 
-// Decode implements encoder.Decode.
-func (r *Rreaddir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rreaddir) decode(b *buffer) {
 	r.Count = b.Read32()
 	entriesBuf := buffer{data: r.payload}
 	r.Entries = r.Entries[:0]
 	for {
 		var d Dirent
-		d.Decode(&entriesBuf)
+		d.decode(&entriesBuf)
 		if entriesBuf.isOverrun() {
 			// Couldn't decode a complete entry.
 			break
@@ -1923,11 +1923,11 @@ func (r *Rreaddir) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rreaddir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rreaddir) encode(b *buffer) {
 	entriesBuf := buffer{}
 	for _, d := range r.Entries {
-		d.Encode(&entriesBuf)
+		d.encode(&entriesBuf)
 		if len(entriesBuf.data) >= int(r.Count) {
 			break
 		}
@@ -1972,13 +1972,13 @@ type Tfsync struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tfsync) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tfsync) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tfsync) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tfsync) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -1996,12 +1996,12 @@ func (t *Tfsync) String() string {
 type Rfsync struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rfsync) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rfsync) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rfsync) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rfsync) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -2020,13 +2020,13 @@ type Tstatfs struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tstatfs) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tstatfs) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tstatfs) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tstatfs) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -2046,14 +2046,14 @@ type Rstatfs struct {
 	FSStat FSStat
 }
 
-// Decode implements encoder.Decode.
-func (r *Rstatfs) Decode(b *buffer) {
-	r.FSStat.Decode(b)
+// decode implements encoder.decode.
+func (r *Rstatfs) decode(b *buffer) {
+	r.FSStat.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rstatfs) Encode(b *buffer) {
-	r.FSStat.Encode(b)
+// encode implements encoder.encode.
+func (r *Rstatfs) encode(b *buffer) {
+	r.FSStat.encode(b)
 }
 
 // Type implements message.Type.
@@ -2072,13 +2072,13 @@ type Tflushf struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tflushf) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tflushf) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tflushf) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tflushf) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -2096,12 +2096,12 @@ func (t *Tflushf) String() string {
 type Rflushf struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rflushf) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rflushf) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rflushf) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rflushf) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -2126,8 +2126,8 @@ type Twalkgetattr struct {
 	Names []string
 }
 
-// Decode implements encoder.Decode.
-func (t *Twalkgetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twalkgetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	n := b.Read16()
@@ -2137,8 +2137,8 @@ func (t *Twalkgetattr) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (t *Twalkgetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Twalkgetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.Write16(uint16(len(t.Names)))
@@ -2169,26 +2169,26 @@ type Rwalkgetattr struct {
 	QIDs []QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwalkgetattr) Decode(b *buffer) {
-	r.Valid.Decode(b)
-	r.Attr.Decode(b)
+// decode implements encoder.decode.
+func (r *Rwalkgetattr) decode(b *buffer) {
+	r.Valid.decode(b)
+	r.Attr.decode(b)
 	n := b.Read16()
 	r.QIDs = r.QIDs[:0]
 	for i := 0; i < int(n); i++ {
 		var q QID
-		q.Decode(b)
+		q.decode(b)
 		r.QIDs = append(r.QIDs, q)
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwalkgetattr) Encode(b *buffer) {
-	r.Valid.Encode(b)
-	r.Attr.Encode(b)
+// encode implements encoder.encode.
+func (r *Rwalkgetattr) encode(b *buffer) {
+	r.Valid.encode(b)
+	r.Attr.encode(b)
 	b.Write16(uint16(len(r.QIDs)))
 	for _, q := range r.QIDs {
-		q.Encode(b)
+		q.encode(b)
 	}
 }
 
@@ -2210,15 +2210,15 @@ type Tucreate struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tucreate) Decode(b *buffer) {
-	t.Tlcreate.Decode(b)
+// decode implements encoder.decode.
+func (t *Tucreate) decode(b *buffer) {
+	t.Tlcreate.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tucreate) Encode(b *buffer) {
-	t.Tlcreate.Encode(b)
+// encode implements encoder.encode.
+func (t *Tucreate) encode(b *buffer) {
+	t.Tlcreate.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2255,15 +2255,15 @@ type Tumkdir struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tumkdir) Decode(b *buffer) {
-	t.Tmkdir.Decode(b)
+// decode implements encoder.decode.
+func (t *Tumkdir) decode(b *buffer) {
+	t.Tmkdir.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tumkdir) Encode(b *buffer) {
-	t.Tmkdir.Encode(b)
+// encode implements encoder.encode.
+func (t *Tumkdir) encode(b *buffer) {
+	t.Tmkdir.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2300,15 +2300,15 @@ type Tumknod struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tumknod) Decode(b *buffer) {
-	t.Tmknod.Decode(b)
+// decode implements encoder.decode.
+func (t *Tumknod) decode(b *buffer) {
+	t.Tmknod.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tumknod) Encode(b *buffer) {
-	t.Tmknod.Encode(b)
+// encode implements encoder.encode.
+func (t *Tumknod) encode(b *buffer) {
+	t.Tmknod.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2345,15 +2345,15 @@ type Tusymlink struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tusymlink) Decode(b *buffer) {
-	t.Tsymlink.Decode(b)
+// decode implements encoder.decode.
+func (t *Tusymlink) decode(b *buffer) {
+	t.Tsymlink.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tusymlink) Encode(b *buffer) {
-	t.Tsymlink.Encode(b)
+// encode implements encoder.encode.
+func (t *Tusymlink) encode(b *buffer) {
+	t.Tsymlink.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2391,14 +2391,14 @@ type Tlconnect struct {
 	Flags ConnectFlags
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlconnect) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlconnect) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Flags = b.ReadConnectFlags()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlconnect) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlconnect) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteConnectFlags(t.Flags)
 }
@@ -2418,11 +2418,11 @@ type Rlconnect struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlconnect) Decode(*buffer) {}
+// decode implements encoder.decode.
+func (r *Rlconnect) decode(*buffer) {}
 
-// Encode implements encoder.Encode.
-func (r *Rlconnect) Encode(*buffer) {}
+// encode implements encoder.encode.
+func (r *Rlconnect) encode(*buffer) {}
 
 // Type implements message.Type.
 func (*Rlconnect) Type() MsgType {
@@ -2445,14 +2445,14 @@ type Tchannel struct {
 	Control uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tchannel) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tchannel) decode(b *buffer) {
 	t.ID = b.Read32()
 	t.Control = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tchannel) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tchannel) encode(b *buffer) {
 	b.Write32(t.ID)
 	b.Write32(t.Control)
 }
@@ -2474,14 +2474,14 @@ type Rchannel struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rchannel) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rchannel) decode(b *buffer) {
 	r.Offset = b.Read64()
 	r.Length = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rchannel) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rchannel) encode(b *buffer) {
 	b.Write64(r.Offset)
 	b.Write64(r.Length)
 }
@@ -2577,7 +2577,7 @@ func calculateSize(m message) uint32 {
 		return p.FixedSize()
 	}
 	var dataBuf buffer
-	m.Encode(&dataBuf)
+	m.encode(&dataBuf)
 	return uint32(len(dataBuf.data))
 }
 
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go
index 825c939da..c20324404 100644
--- a/pkg/p9/messages_test.go
+++ b/pkg/p9/messages_test.go
@@ -382,7 +382,7 @@ func TestEncodeDecode(t *testing.T) {
 		// Encode the original.
 		data := make([]byte, initialBufferLength)
 		buf := buffer{data: data[:0]}
-		enc.Encode(&buf)
+		enc.encode(&buf)
 
 		// Create a new object, same as the first.
 		enc2 := reflect.New(reflect.ValueOf(enc).Elem().Type()).Interface().(encoder)
@@ -399,7 +399,7 @@ func TestEncodeDecode(t *testing.T) {
 		}
 
 		// Mark sure it was okay.
-		enc2.Decode(&buf2)
+		enc2.decode(&buf2)
 		if buf2.isOverrun() {
 			t.Errorf("object %#v->%#v got overrun on decode", enc, enc2)
 			continue
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 20ab31f7a..28d851ff5 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -450,15 +450,15 @@ func (q QID) String() string {
 	return fmt.Sprintf("QID{Type: %d, Version: %d, Path: %d}", q.Type, q.Version, q.Path)
 }
 
-// Decode implements encoder.Decode.
-func (q *QID) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (q *QID) decode(b *buffer) {
 	q.Type = b.ReadQIDType()
 	q.Version = b.Read32()
 	q.Path = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (q *QID) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (q *QID) encode(b *buffer) {
 	b.WriteQIDType(q.Type)
 	b.Write32(q.Version)
 	b.Write64(q.Path)
@@ -515,8 +515,8 @@ type FSStat struct {
 	NameLength uint32
 }
 
-// Decode implements encoder.Decode.
-func (f *FSStat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (f *FSStat) decode(b *buffer) {
 	f.Type = b.Read32()
 	f.BlockSize = b.Read32()
 	f.Blocks = b.Read64()
@@ -528,8 +528,8 @@ func (f *FSStat) Decode(b *buffer) {
 	f.NameLength = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (f *FSStat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (f *FSStat) encode(b *buffer) {
 	b.Write32(f.Type)
 	b.Write32(f.BlockSize)
 	b.Write64(f.Blocks)
@@ -679,8 +679,8 @@ func (a AttrMask) String() string {
 	return fmt.Sprintf("AttrMask{with: %s}", strings.Join(masks, " "))
 }
 
-// Decode implements encoder.Decode.
-func (a *AttrMask) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *AttrMask) decode(b *buffer) {
 	mask := b.Read64()
 	a.Mode = mask&0x00000001 != 0
 	a.NLink = mask&0x00000002 != 0
@@ -698,8 +698,8 @@ func (a *AttrMask) Decode(b *buffer) {
 	a.DataVersion = mask&0x00002000 != 0
 }
 
-// Encode implements encoder.Encode.
-func (a *AttrMask) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *AttrMask) encode(b *buffer) {
 	var mask uint64
 	if a.Mode {
 		mask |= 0x00000001
@@ -774,8 +774,8 @@ func (a Attr) String() string {
 		a.Mode, a.UID, a.GID, a.NLink, a.RDev, a.Size, a.BlockSize, a.Blocks, a.ATimeSeconds, a.ATimeNanoSeconds, a.MTimeSeconds, a.MTimeNanoSeconds, a.CTimeSeconds, a.CTimeNanoSeconds, a.BTimeSeconds, a.BTimeNanoSeconds, a.Gen, a.DataVersion)
 }
 
-// Encode implements encoder.Encode.
-func (a *Attr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *Attr) encode(b *buffer) {
 	b.WriteFileMode(a.Mode)
 	b.WriteUID(a.UID)
 	b.WriteGID(a.GID)
@@ -796,8 +796,8 @@ func (a *Attr) Encode(b *buffer) {
 	b.Write64(a.DataVersion)
 }
 
-// Decode implements encoder.Decode.
-func (a *Attr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *Attr) decode(b *buffer) {
 	a.Mode = b.ReadFileMode()
 	a.UID = b.ReadUID()
 	a.GID = b.ReadGID()
@@ -926,8 +926,8 @@ func (s SetAttrMask) Empty() bool {
 	return !s.Permissions && !s.UID && !s.GID && !s.Size && !s.ATime && !s.MTime && !s.CTime && !s.ATimeNotSystemTime && !s.MTimeNotSystemTime
 }
 
-// Decode implements encoder.Decode.
-func (s *SetAttrMask) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (s *SetAttrMask) decode(b *buffer) {
 	mask := b.Read32()
 	s.Permissions = mask&0x00000001 != 0
 	s.UID = mask&0x00000002 != 0
@@ -972,8 +972,8 @@ func (s SetAttrMask) bitmask() uint32 {
 	return mask
 }
 
-// Encode implements encoder.Encode.
-func (s *SetAttrMask) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (s *SetAttrMask) encode(b *buffer) {
 	b.Write32(s.bitmask())
 }
 
@@ -994,8 +994,8 @@ func (s SetAttr) String() string {
 	return fmt.Sprintf("SetAttr{Permissions: 0o%o, UID: %d, GID: %d, Size: %d, ATime: {Sec: %d, NanoSec: %d}, MTime: {Sec: %d, NanoSec: %d}}", s.Permissions, s.UID, s.GID, s.Size, s.ATimeSeconds, s.ATimeNanoSeconds, s.MTimeSeconds, s.MTimeNanoSeconds)
 }
 
-// Decode implements encoder.Decode.
-func (s *SetAttr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (s *SetAttr) decode(b *buffer) {
 	s.Permissions = b.ReadPermissions()
 	s.UID = b.ReadUID()
 	s.GID = b.ReadGID()
@@ -1006,8 +1006,8 @@ func (s *SetAttr) Decode(b *buffer) {
 	s.MTimeNanoSeconds = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (s *SetAttr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (s *SetAttr) encode(b *buffer) {
 	b.WritePermissions(s.Permissions)
 	b.WriteUID(s.UID)
 	b.WriteGID(s.GID)
@@ -1064,17 +1064,17 @@ func (d Dirent) String() string {
 	return fmt.Sprintf("Dirent{QID: %d, Offset: %d, Type: 0x%X, Name: %s}", d.QID, d.Offset, d.Type, d.Name)
 }
 
-// Decode implements encoder.Decode.
-func (d *Dirent) Decode(b *buffer) {
-	d.QID.Decode(b)
+// decode implements encoder.decode.
+func (d *Dirent) decode(b *buffer) {
+	d.QID.decode(b)
 	d.Offset = b.Read64()
 	d.Type = b.ReadQIDType()
 	d.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (d *Dirent) Encode(b *buffer) {
-	d.QID.Encode(b)
+// encode implements encoder.encode.
+func (d *Dirent) encode(b *buffer) {
+	d.QID.encode(b)
 	b.Write64(d.Offset)
 	b.WriteQIDType(d.Type)
 	b.WriteString(d.Name)
@@ -1118,8 +1118,8 @@ func (a *AllocateMode) ToLinux() uint32 {
 	return rv
 }
 
-// Decode implements encoder.Decode.
-func (a *AllocateMode) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *AllocateMode) decode(b *buffer) {
 	mask := b.Read32()
 	a.KeepSize = mask&0x01 != 0
 	a.PunchHole = mask&0x02 != 0
@@ -1130,8 +1130,8 @@ func (a *AllocateMode) Decode(b *buffer) {
 	a.Unshare = mask&0x40 != 0
 }
 
-// Encode implements encoder.Encode.
-func (a *AllocateMode) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *AllocateMode) encode(b *buffer) {
 	mask := uint32(0)
 	if a.KeepSize {
 		mask |= 0x01
diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go
index 9c11e28ce..7cec0e86d 100644
--- a/pkg/p9/transport.go
+++ b/pkg/p9/transport.go
@@ -80,7 +80,7 @@ func send(s *unet.Socket, tag Tag, m message) error {
 	}
 
 	// Encode the message. The buffer will grow automatically.
-	m.Encode(&dataBuf)
+	m.encode(&dataBuf)
 
 	// Get our vectors to send.
 	var hdr [headerLength]byte
@@ -316,7 +316,7 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message,
 	}
 
 	// Decode the message data.
-	m.Decode(&dataBuf)
+	m.decode(&dataBuf)
 	if dataBuf.isOverrun() {
 		// No need to drain the socket.
 		return NoTag, nil, ErrNoValidMessage
diff --git a/pkg/p9/transport_flipcall.go b/pkg/p9/transport_flipcall.go
index 233f825e3..a0d274f3b 100644
--- a/pkg/p9/transport_flipcall.go
+++ b/pkg/p9/transport_flipcall.go
@@ -151,7 +151,7 @@ func (ch *channel) send(m message) (uint32, error) {
 	} else {
 		ch.buf.Write8(0) // No incoming FD.
 	}
-	m.Encode(&ch.buf)
+	m.encode(&ch.buf)
 	ssz := uint32(len(ch.buf.data)) // Updated below.
 
 	// Is there a payload?
@@ -205,7 +205,7 @@ func (ch *channel) recv(r message, rsz uint32) (message, error) {
 		ch.buf.data = ch.buf.data[:fs]
 	}
 
-	r.Decode(&ch.buf)
+	r.decode(&ch.buf)
 	if ch.buf.isOverrun() {
 		// Nothing valid was available.
 		log.Debugf("recv [got %d bytes, needed more]", rsz)
diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go
index 2f50ff3ea..3668fcad7 100644
--- a/pkg/p9/transport_test.go
+++ b/pkg/p9/transport_test.go
@@ -56,8 +56,8 @@ func TestSendRecv(t *testing.T) {
 // badDecode overruns on decode.
 type badDecode struct{}
 
-func (*badDecode) Decode(b *buffer) { b.markOverrun() }
-func (*badDecode) Encode(b *buffer) {}
+func (*badDecode) decode(b *buffer) { b.markOverrun() }
+func (*badDecode) encode(b *buffer) {}
 func (*badDecode) Type() MsgType    { return MsgTypeBadDecode }
 func (*badDecode) String() string   { return "badDecode{}" }
 
@@ -81,8 +81,8 @@ func TestRecvOverrun(t *testing.T) {
 // unregistered is not registered on decode.
 type unregistered struct{}
 
-func (*unregistered) Decode(b *buffer) {}
-func (*unregistered) Encode(b *buffer) {}
+func (*unregistered) decode(b *buffer) {}
+func (*unregistered) encode(b *buffer) {}
 func (*unregistered) Type() MsgType    { return MsgTypeUnregistered }
 func (*unregistered) String() string   { return "unregistered{}" }
 
-- 
cgit v1.2.3


From e4c7f3e6f6c19f3259820a4c41b69e85c0454379 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 13:39:51 -0800
Subject: Inline vfs.VirtualFilesystem in Kernel struct

This saves one pointer dereference per VFS access.

Updates #1623

PiperOrigin-RevId: 295216176
---
 pkg/sentry/control/proc.go                        |  2 +-
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go       |  5 +++-
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  5 +++-
 pkg/sentry/fsimpl/ext/ext_test.go                 |  5 +++-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go           |  5 +++-
 pkg/sentry/fsimpl/proc/tasks_test.go              |  6 ++--
 pkg/sentry/fsimpl/sys/sys_test.go                 |  6 ++--
 pkg/sentry/fsimpl/testutil/kernel.go              |  7 +++--
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go         | 10 +++++--
 pkg/sentry/fsimpl/tmpfs/pipe_test.go              |  5 +++-
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go      |  6 +++-
 pkg/sentry/kernel/kernel.go                       |  9 ++++--
 pkg/sentry/vfs/dentry.go                          |  4 ++-
 pkg/sentry/vfs/device.go                          |  3 ++
 pkg/sentry/vfs/file_description_impl_util_test.go | 10 +++++--
 pkg/sentry/vfs/filesystem.go                      |  2 ++
 pkg/sentry/vfs/filesystem_type.go                 |  1 +
 pkg/sentry/vfs/mount.go                           |  4 +++
 pkg/sentry/vfs/mount_unsafe.go                    |  8 ++++--
 pkg/sentry/vfs/vfs.go                             | 35 +++++++++++------------
 20 files changed, 94 insertions(+), 44 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 8973754c8..5457ba5e7 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -199,7 +199,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 			}
 
 			paths := fs.GetPath(initArgs.Envv)
-			vfsObj := proc.Kernel.VFS
+			vfsObj := proc.Kernel.VFS()
 			file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
 			if err != nil {
 				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 73308a2b5..b6d52c015 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -29,7 +29,10 @@ func TestDevtmpfs(t *testing.T) {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	// Register tmpfs just so that we can have a root filesystem that isn't
 	// devtmpfs.
 	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 2015a8871..89caee3df 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -52,7 +52,10 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		return nil, nil, nil, nil, err
+	}
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 05f992826..ef6127f3c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -65,7 +65,10 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 96a16e654..0459fb305 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -45,7 +45,10 @@ type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
 func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
-	v := vfs.New()
+	v := &vfs.VirtualFilesystem{}
+	if err := v.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 96c72cbc9..c5d531fe0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -90,7 +90,7 @@ func setup(t *testing.T) *testutil.System {
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
 
-	k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 	fsOpts := vfs.GetFilesystemOptions{
@@ -101,11 +101,11 @@ func setup(t *testing.T) *testutil.System {
 			},
 		},
 	}
-	mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts)
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts)
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
-	return testutil.NewSystem(ctx, t, k.VFS, mntns)
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
 }
 
 func TestTasksEmpty(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 5d1ba5867..4b3602d47 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -34,15 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System {
 	}
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
-	k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 
-	mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
-	return testutil.NewSystem(ctx, t, k.VFS, mns)
+	return testutil.NewSystem(ctx, t, k.VFS(), mns)
 }
 
 func TestReadCPUFile(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index a91b3ec4d..d0be32e72 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -102,12 +102,13 @@ func Boot() (*kernel.Kernel, error) {
 
 	kernel.VFS2Enabled = true
 
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	if err := k.VFS().Init(); err != nil {
+		return nil, fmt.Errorf("VFS init: %v", err)
+	}
+	k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 		AllowUserList:  true,
 	})
-	k.VFS = vfsObj
 
 	ls, err := limits.NewLinuxLimitSet()
 	if err != nil {
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 9fce5e4b4..383133e44 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -175,7 +175,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
@@ -366,7 +369,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 5ee7f2a72..1614f2c39 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -151,7 +151,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index e9f71e334..0399725cf 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -40,7 +40,11 @@ var nextFileID int64
 func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+	}
+
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 2665f057c..ea21af33f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -246,7 +246,7 @@ type Kernel struct {
 	SpecialOpts
 
 	// VFS keeps the filesystem state used across the kernel.
-	VFS *vfs.VirtualFilesystem
+	vfs vfs.VirtualFilesystem
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -815,7 +815,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 				FollowFinalSymlink: true,
 			}
 			var err error
-			wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
+			wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
 				CheckSearchable: true,
 			})
 			if err != nil {
@@ -1506,3 +1506,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
 		Registers: t.Arch().StateData().Proto(),
 	})
 }
+
+// VFS returns the virtual filesystem for the kernel.
+func (k *Kernel) VFS() *vfs.VirtualFilesystem {
+	return &k.vfs
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 486a76475..35b208721 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -71,6 +71,8 @@ import (
 // lifetime. Dentry reference counts only indicate the extent to which VFS
 // requires Dentries to exist; Filesystems may elect to cache or discard
 // Dentries with zero references.
+//
+// +stateify savable
 type Dentry struct {
 	// parent is this Dentry's parent in this Filesystem. If this Dentry is
 	// independent, parent is nil.
@@ -89,7 +91,7 @@ type Dentry struct {
 	children map[string]*Dentry
 
 	// mu synchronizes disowning and mounting over this Dentry.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// impl is the DentryImpl associated with this Dentry. impl is immutable.
 	// This should be the last field in Dentry.
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 3af2aa58d..bda5576fa 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -56,6 +56,7 @@ type Device interface {
 	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
 }
 
+// +stateify savable
 type registeredDevice struct {
 	dev  Device
 	opts RegisterDeviceOptions
@@ -63,6 +64,8 @@ type registeredDevice struct {
 
 // RegisterDeviceOptions contains options to
 // VirtualFilesystem.RegisterDevice().
+//
+// +stateify savable
 type RegisterDeviceOptions struct {
 	// GroupName is the name shown for this device registration in
 	// /proc/devices. If GroupName is empty, this registration will not be
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 8fa26418e..3a75d4d62 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
 func TestGenCountFD(t *testing.T) {
 	ctx := contexttest.Context(t)
 
-	vfsObj := New() // vfs.New()
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
 	defer fd.DecRef()
 
@@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) {
 func TestWritable(t *testing.T) {
 	ctx := contexttest.Context(t)
 
-	vfsObj := New() // vfs.New()
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
 	defer fd.DecRef()
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index a06a6caf3..556976d0b 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -29,6 +29,8 @@ import (
 // Filesystem methods require that a reference is held.
 //
 // Filesystem is analogous to Linux's struct super_block.
+//
+// +stateify savable
 type Filesystem struct {
 	// refs is the reference count. refs is accessed using atomic memory
 	// operations.
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c58b70728..bb9cada81 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -44,6 +44,7 @@ type GetFilesystemOptions struct {
 	InternalData interface{}
 }
 
+// +stateify savable
 type registeredFilesystemType struct {
 	fsType FilesystemType
 	opts   RegisterFilesystemTypeOptions
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index ad2c9fcf4..9912df799 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -38,6 +38,8 @@ import (
 //
 // Mount is analogous to Linux's struct mount. (gVisor does not distinguish
 // between struct mount and struct vfsmount.)
+//
+// +stateify savable
 type Mount struct {
 	// vfs, fs, and root are immutable. References are held on fs and root.
 	//
@@ -85,6 +87,8 @@ type Mount struct {
 // MountNamespace methods require that a reference is held.
 //
 // MountNamespace is analogous to Linux's struct mnt_namespace.
+//
+// +stateify savable
 type MountNamespace struct {
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bd90d36c4..1fe766a44 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -64,6 +64,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) {
 // (provided mutation is sufficiently uncommon).
 //
 // mountTable.Init() must be called on new mountTables before use.
+//
+// +stateify savable
 type mountTable struct {
 	// mountTable is implemented as a seqcount-protected hash table that
 	// resolves collisions with linear probing, featuring Robin Hood insertion
@@ -75,8 +77,8 @@ type mountTable struct {
 	// intrinsics and inline assembly, limiting the performance of this
 	// approach.)
 
-	seq  sync.SeqCount
-	seed uint32 // for hashing keys
+	seq  sync.SeqCount `state:"nosave"`
+	seed uint32        // for hashing keys
 
 	// size holds both length (number of elements) and capacity (number of
 	// slots): capacity is stored as its base-2 log (referred to as order) in
@@ -89,7 +91,7 @@ type mountTable struct {
 	// length and cap in separate uint32s) for ~free.
 	size uint64
 
-	slots unsafe.Pointer // []mountSlot; never nil after Init
+	slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
 }
 
 type mountSlot struct {
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 51deae313..8f29031b2 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -46,11 +46,13 @@ import (
 //
 // There is no analogue to the VirtualFilesystem type in Linux, as the
 // equivalent state in Linux is global.
+//
+// +stateify savable
 type VirtualFilesystem struct {
 	// mountMu serializes mount mutations.
 	//
 	// mountMu is analogous to Linux's namespace_sem.
-	mountMu sync.Mutex
+	mountMu sync.Mutex `state:"nosave"`
 
 	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
 	// are uniquely namespaced, including mount parent in the key correctly
@@ -89,44 +91,42 @@ type VirtualFilesystem struct {
 
 	// devices contains all registered Devices. devices is protected by
 	// devicesMu.
-	devicesMu sync.RWMutex
+	devicesMu sync.RWMutex `state:"nosave"`
 	devices   map[devTuple]*registeredDevice
 
 	// anonBlockDevMinor contains all allocated anonymous block device minor
 	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
 	// unallocated anonymous block device number. anonBlockDevMinorNext and
 	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
-	anonBlockDevMinorMu   sync.Mutex
+	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
 	anonBlockDevMinorNext uint32
 	anonBlockDevMinor     map[uint32]struct{}
 
 	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
 	// fsTypesMu.
-	fsTypesMu sync.RWMutex
+	fsTypesMu sync.RWMutex `state:"nosave"`
 	fsTypes   map[string]*registeredFilesystemType
 
 	// filesystems contains all Filesystems. filesystems is protected by
 	// filesystemsMu.
-	filesystemsMu sync.Mutex
+	filesystemsMu sync.Mutex `state:"nosave"`
 	filesystems   map[*Filesystem]struct{}
 }
 
-// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
-func New() *VirtualFilesystem {
-	vfs := &VirtualFilesystem{
-		mountpoints:           make(map[*Dentry]map[*Mount]struct{}),
-		devices:               make(map[devTuple]*registeredDevice),
-		anonBlockDevMinorNext: 1,
-		anonBlockDevMinor:     make(map[uint32]struct{}),
-		fsTypes:               make(map[string]*registeredFilesystemType),
-		filesystems:           make(map[*Filesystem]struct{}),
-	}
+// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
+func (vfs *VirtualFilesystem) Init() error {
+	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+	vfs.devices = make(map[devTuple]*registeredDevice)
+	vfs.anonBlockDevMinorNext = 1
+	vfs.anonBlockDevMinor = make(map[uint32]struct{})
+	vfs.fsTypes = make(map[string]*registeredFilesystemType)
+	vfs.filesystems = make(map[*Filesystem]struct{})
 	vfs.mounts.Init()
 
 	// Construct vfs.anonMount.
 	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
 	if err != nil {
-		panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+		return err
 	}
 	anonfs := anonFilesystem{
 		devMinor: anonfsDevMinor,
@@ -137,8 +137,7 @@ func New() *VirtualFilesystem {
 		fs:   &anonfs.vfsfs,
 		refs: 1,
 	}
-
-	return vfs
+	return nil
 }
 
 // PathOperation specifies the path operated on by a VFS method.
-- 
cgit v1.2.3


From 87bc2834c97a958d0762833fe8db749ccc6d5d50 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 14:23:35 -0800
Subject: Enable automated marshalling for RSeqCriticalSection.

PiperOrigin-RevId: 295226468
---
 pkg/sentry/kernel/rseq.go | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index efebfd872..18416643b 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -303,26 +303,14 @@ func (t *Task) rseqAddrInterrupt() {
 		return
 	}
 
-	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
-	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+	var cs linux.RSeqCriticalSection
+	if _, err := cs.CopyIn(t, critAddr); err != nil {
 		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
 		return
 	}
 
-	// Manually marshal RSeqCriticalSection as this is in the hot path when
-	// rseq is enabled. It must be as fast as possible.
-	//
-	// TODO(b/130243041): Replace with go_marshal.
-	cs := linux.RSeqCriticalSection{
-		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
-		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
-		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
-		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
-		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
-	}
-
 	if cs.Version != 0 {
 		t.Debugf("Unknown version in %+v", cs)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
-- 
cgit v1.2.3


From 3557b2665198b57c04924ad4be8dbf9e42cedf71 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 14:39:40 -0800
Subject: Allow vfs.IterDirentsCallback.Handle() to return an error.

This is easier than storing errors from e.g. CopyOut in the callback.

PiperOrigin-RevId: 295230021
---
 pkg/sentry/fsimpl/ext/directory.go       |  6 +++---
 pkg/sentry/fsimpl/ext/ext_test.go        |  4 ++--
 pkg/sentry/fsimpl/gofer/directory.go     |  4 ++--
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 12 ++++++------
 pkg/sentry/fsimpl/proc/subtasks.go       |  4 ++--
 pkg/sentry/fsimpl/proc/tasks.go          | 12 ++++++------
 pkg/sentry/fsimpl/testutil/testutil.go   |  4 ++--
 pkg/sentry/fsimpl/tmpfs/directory.go     | 18 +++++++++---------
 pkg/sentry/vfs/file_description.go       | 10 +++++-----
 9 files changed, 37 insertions(+), 37 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index ebb72b75e..bd6ede995 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -188,14 +188,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 				childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
 			}
 
-			if !cb.Handle(vfs.Dirent{
+			if err := cb.Handle(vfs.Dirent{
 				Name:    child.diskDirent.FileName(),
 				Type:    fs.ToDirentType(childType),
 				Ino:     uint64(child.diskDirent.Inode()),
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index ef6127f3c..29bb73765 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -499,9 +499,9 @@ func newIterDirentCb() *iterDirentsCb {
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
-func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error {
 	cb.dirents = append(cb.dirents, dirent)
-	return true
+	return nil
 }
 
 // TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 6d4ebc2bf..5dbfc6250 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -65,8 +65,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	}
 
 	for fd.off < int64(len(fd.dirents)) {
-		if !cb.Handle(fd.dirents[fd.off]) {
-			return nil
+		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+			return err
 		}
 		fd.off++
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index eda781155..5650512e0 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -116,8 +116,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 			Ino:     stat.Ino,
 			NextOff: 1,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -132,8 +132,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 			Ino:     stat.Ino,
 			NextOff: 2,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -153,8 +153,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 			Ino:     stat.Ino,
 			NextOff: fd.off + 1,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 102af0e93..f3f4e49b4 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -105,8 +105,8 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 			Ino:     i.inoGen.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index ebe21630c..ce08a7d53 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -151,8 +151,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 			Ino:     i.inoGen.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
@@ -163,8 +163,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 			Ino:     i.inoGen.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
@@ -196,8 +196,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 			Ino:     i.inoGen.NextIno(),
 			NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index b97e3534a..e16808c63 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -226,7 +226,7 @@ func (d *DirentCollector) SkipDotsChecks(value bool) {
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
-func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
+func (d *DirentCollector) Handle(dirent vfs.Dirent) error {
 	d.mu.Lock()
 	if d.dirents == nil {
 		d.dirents = make(map[string]*vfs.Dirent)
@@ -234,7 +234,7 @@ func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
 	d.order = append(d.order, &dirent)
 	d.dirents[dirent.Name] = &dirent
 	d.mu.Unlock()
-	return true
+	return nil
 }
 
 // Count returns the number of dirents currently in the collector.
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index dc0d27cf9..b4380af38 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -74,25 +74,25 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	defer fs.mu.Unlock()
 
 	if fd.off == 0 {
-		if !cb.Handle(vfs.Dirent{
+		if err := cb.Handle(vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
 			Ino:     vfsd.Impl().(*dentry).inode.ino,
 			NextOff: 1,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
 	if fd.off == 1 {
 		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
-		if !cb.Handle(vfs.Dirent{
+		if err := cb.Handle(vfs.Dirent{
 			Name:    "..",
 			Type:    parentInode.direntType(),
 			Ino:     parentInode.ino,
 			NextOff: 2,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -111,14 +111,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	for child != nil {
 		// Skip other directoryFD iterators.
 		if child.inode != nil {
-			if !cb.Handle(vfs.Dirent{
+			if err := cb.Handle(vfs.Dirent{
 				Name:    child.vfsd.Name(),
 				Type:    child.inode.direntType(),
 				Ino:     child.inode.ino,
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 5bac660c7..9a1ad630c 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -435,11 +435,11 @@ type Dirent struct {
 
 // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
 type IterDirentsCallback interface {
-	// Handle handles the given iterated Dirent. It returns true if iteration
-	// should continue, and false if FileDescriptionImpl.IterDirents should
-	// terminate now and restart with the same Dirent the next time it is
-	// called.
-	Handle(dirent Dirent) bool
+	// Handle handles the given iterated Dirent. If Handle returns a non-nil
+	// error, FileDescriptionImpl.IterDirents must stop iteration and return
+	// the error; the next call to FileDescriptionImpl.IterDirents should
+	// restart with the same Dirent.
+	Handle(dirent Dirent) error
 }
 
 // OnClose is called when a file descriptor representing the FileDescription is
-- 
cgit v1.2.3


From 5baf9dc2fbb459828b4102b0a1c5214879434c03 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 15:48:09 -0800
Subject: Synchronize signalling with S/R

This is to fix a data race between sending an external signal to
a ThreadGroup and kernel saving state for S/R.

PiperOrigin-RevId: 295244281
---
 pkg/sentry/kernel/kernel.go | 8 ++++++++
 runsc/boot/loader.go        | 8 ++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index ea21af33f..7da0368f1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -1169,6 +1169,14 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
 	k.sendExternalSignal(info, context)
 }
 
+// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup.
+// This function doesn't skip signals like SendExternalSignal does.
+func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return tg.SendSignal(info)
+}
+
 // SendContainerSignal sends the given signal to all processes inside the
 // namespace that match the given container ID.
 func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 239ca5302..eef43b9df 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -997,7 +997,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
 	execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
 	if err == nil {
 		// Send signal directly to the identified process.
-		return execTG.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
 	}
 
 	// The caller may be signaling a process not started directly via exec.
@@ -1014,7 +1014,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
 	if tg.Leader().ContainerID() != cid {
 		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
 	}
-	return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+	return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 }
 
 func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
@@ -1032,7 +1032,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s
 		// No foreground process group has been set. Signal the
 		// original thread group.
 		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
-		return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 	}
 	// Send the signal to all processes in the process group.
 	var lastErr error
@@ -1040,7 +1040,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s
 		if tg.ProcessGroup() != pg {
 			continue
 		}
-		if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil {
+		if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil {
 			lastErr = err
 		}
 	}
-- 
cgit v1.2.3


From a5069f820f22734b6c466068a02bbbe83ba091da Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 14 Feb 2020 16:19:00 -0800
Subject: Remove linux.EpollEvent.Fd.

glibc defines struct epoll_event in such a way that epoll_event.data.fd exists.
However, the kernel's definition of struct epoll_event makes epoll_event.data
an opaque uint64, so naming half of it "fd" just introduces confusion. Remove
the Fd field, and make Data a [2]int32 to compensate.

Also add required padding to linux.EpollEvent on ARM64.

PiperOrigin-RevId: 295250424
---
 pkg/abi/linux/BUILD                    |  2 ++
 pkg/abi/linux/epoll.go                 |  7 -------
 pkg/abi/linux/epoll_amd64.go           | 25 +++++++++++++++++++++++++
 pkg/abi/linux/epoll_arm64.go           | 24 ++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_epoll.go |  3 +--
 pkg/sentry/vfs/epoll.go                | 19 +++++++++----------
 6 files changed, 61 insertions(+), 19 deletions(-)
 create mode 100644 pkg/abi/linux/epoll_amd64.go
 create mode 100644 pkg/abi/linux/epoll_arm64.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index b7015367b..a89f34d4b 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -17,6 +17,8 @@ go_library(
         "dev.go",
         "elf.go",
         "epoll.go",
+        "epoll_amd64.go",
+        "epoll_arm64.go",
         "errors.go",
         "eventfd.go",
         "exec.go",
diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go
index 0e881aa3c..6e4de69da 100644
--- a/pkg/abi/linux/epoll.go
+++ b/pkg/abi/linux/epoll.go
@@ -14,13 +14,6 @@
 
 package linux
 
-// EpollEvent is equivalent to struct epoll_event from epoll(2).
-type EpollEvent struct {
-	Events uint32
-	Fd     int32
-	Data   int32
-}
-
 // Event masks.
 const (
 	EPOLLIN     = 0x1
diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go
new file mode 100644
index 000000000..57041491c
--- /dev/null
+++ b/pkg/abi/linux/epoll_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// EpollEvent is equivalent to struct epoll_event from epoll(2).
+type EpollEvent struct {
+	Events uint32
+	// Linux makes struct epoll_event::data a __u64. We represent it as
+	// [2]int32 because, on amd64, Linux also makes struct epoll_event
+	// __attribute__((packed)), such that there is no padding between Events
+	// and Data.
+	Data [2]int32
+}
diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go
new file mode 100644
index 000000000..62ef5821e
--- /dev/null
+++ b/pkg/abi/linux/epoll_arm64.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// EpollEvent is equivalent to struct epoll_event from epoll(2).
+type EpollEvent struct {
+	Events uint32
+	// Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding
+	// here.
+	_    int32
+	Data [2]int32
+}
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 5f11b496c..fbef5b376 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -83,8 +83,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		}
 
 		mask = waiter.EventMaskFromLinux(e.Events)
-		data[0] = e.Fd
-		data[1] = e.Data
+		data = e.Data
 	}
 
 	// Perform the requested operations.
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 7c83f9a5a..eed41139b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -85,8 +85,8 @@ type epollInterest struct {
 	ready bool
 	epollInterestEntry
 
-	// userData is the epoll_data_t associated with this epollInterest.
-	// userData is protected by epoll.mu.
+	// userData is the struct epoll_event::data associated with this
+	// epollInterest. userData is protected by epoll.mu.
 	userData [2]int32
 }
 
@@ -157,7 +157,7 @@ func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (
 // AddInterest implements the semantics of EPOLL_CTL_ADD.
 //
 // Preconditions: A reference must be held on file.
-func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
 	// Check for cyclic polling if necessary.
 	subep, _ := file.impl.(*EpollInstance)
 	if subep != nil {
@@ -183,12 +183,12 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint
 	}
 
 	// Register interest in file.
-	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
 	epi := &epollInterest{
 		epoll:    ep,
 		key:      key,
 		mask:     mask,
-		userData: userData,
+		userData: event.Data,
 	}
 	ep.interest[key] = epi
 	wmask := waiter.EventMaskFromLinux(mask)
@@ -236,7 +236,7 @@ func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursi
 // ModifyInterest implements the semantics of EPOLL_CTL_MOD.
 //
 // Preconditions: A reference must be held on file.
-func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
 	ep.interestMu.Lock()
 	defer ep.interestMu.Unlock()
 
@@ -250,13 +250,13 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask u
 	}
 
 	// Update epi for the next call to ep.ReadEvents().
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
 	ep.mu.Lock()
 	epi.mask = mask
-	epi.userData = userData
+	epi.userData = event.Data
 	ep.mu.Unlock()
 
 	// Re-register with the new mask.
-	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
 	file.EventUnregister(&epi.waiter)
 	wmask := waiter.EventMaskFromLinux(mask)
 	file.EventRegister(&epi.waiter, wmask)
@@ -363,8 +363,7 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
 		// Report ievents.
 		events[i] = linux.EpollEvent{
 			Events: ievents.ToLinux(),
-			Fd:     epi.userData[0],
-			Data:   epi.userData[1],
+			Data:   epi.userData,
 		}
 		i++
 		if i == len(events) {
-- 
cgit v1.2.3


From fae3de21af7f50266565643c6283912b087b0f5a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 10:49:47 -0800
Subject: ring0/pagetables: fix typo

PiperOrigin-RevId: 295770717
---
 pkg/sentry/platform/ring0/pagetables/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 971eed7fa..4f2406ce3 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -7,7 +7,7 @@ go_template(
     name = "generic_walker",
     srcs = select_arch(
         amd64 = ["walker_amd64.go"],
-        arm64 = ["walker_amd64.go"],
+        arm64 = ["walker_arm64.go"],
     ),
     opt_types = [
         "Visitor",
-- 
cgit v1.2.3


From 906eb6295d54a05663a223f1dc379a16148de2d1 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 13:42:31 -0800
Subject: atomicbitops package cleanups

- Redocument memory ordering from "no ordering" to "acquire-release". (No
  functional change: both LOCK WHATEVER on x86, and LDAXR/STLXR loops on ARM64,
  already have this property.)

- Remove IncUnlessZeroInt32 and DecUnlessOneInt32, which were only faster than
  the equivalent loops using sync/atomic before the Go compiler inlined
  non-unsafe.Pointer atomics many releases ago.

PiperOrigin-RevId: 295811743
---
 pkg/atomicbitops/BUILD                   |   9 +-
 pkg/atomicbitops/atomic_bitops.go        |  60 -------
 pkg/atomicbitops/atomic_bitops_amd64.s   | 115 --------------
 pkg/atomicbitops/atomic_bitops_arm64.s   | 139 ----------------
 pkg/atomicbitops/atomic_bitops_common.go | 147 -----------------
 pkg/atomicbitops/atomic_bitops_test.go   | 262 -------------------------------
 pkg/atomicbitops/atomicbitops.go         |  47 ++++++
 pkg/atomicbitops/atomicbitops_amd64.s    |  77 +++++++++
 pkg/atomicbitops/atomicbitops_arm64.s    | 105 +++++++++++++
 pkg/atomicbitops/atomicbitops_noasm.go   | 105 +++++++++++++
 pkg/atomicbitops/atomicbitops_test.go    | 198 +++++++++++++++++++++++
 pkg/sentry/mm/address_space.go           |  23 ++-
 pkg/sentry/mm/lifecycle.go               |  11 +-
 13 files changed, 563 insertions(+), 735 deletions(-)
 delete mode 100644 pkg/atomicbitops/atomic_bitops.go
 delete mode 100644 pkg/atomicbitops/atomic_bitops_amd64.s
 delete mode 100644 pkg/atomicbitops/atomic_bitops_arm64.s
 delete mode 100644 pkg/atomicbitops/atomic_bitops_common.go
 delete mode 100644 pkg/atomicbitops/atomic_bitops_test.go
 create mode 100644 pkg/atomicbitops/atomicbitops.go
 create mode 100644 pkg/atomicbitops/atomicbitops_amd64.s
 create mode 100644 pkg/atomicbitops/atomicbitops_arm64.s
 create mode 100644 pkg/atomicbitops/atomicbitops_noasm.go
 create mode 100644 pkg/atomicbitops/atomicbitops_test.go

(limited to 'pkg')

diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 3948074ba..ba8b06071 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -5,10 +5,9 @@ package(licenses = ["notice"])
 go_library(
     name = "atomicbitops",
     srcs = [
-        "atomic_bitops.go",
-        "atomic_bitops_amd64.s",
-        "atomic_bitops_arm64.s",
-        "atomic_bitops_common.go",
+        "atomicbitops.go",
+        "atomicbitops_amd64.s",
+        "atomicbitops_noasm.go",
     ],
     visibility = ["//:sandbox"],
 )
@@ -16,7 +15,7 @@ go_library(
 go_test(
     name = "atomicbitops_test",
     size = "small",
-    srcs = ["atomic_bitops_test.go"],
+    srcs = ["atomicbitops_test.go"],
     library = ":atomicbitops",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/atomicbitops/atomic_bitops.go b/pkg/atomicbitops/atomic_bitops.go
deleted file mode 100644
index fcc41a9ea..000000000
--- a/pkg/atomicbitops/atomic_bitops.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build amd64 arm64
-
-// Package atomicbitops provides basic bitwise operations in an atomic way.
-// The implementation on amd64 leverages the LOCK prefix directly instead of
-// relying on the generic cas primitives, and the arm64 leverages the LDAXR
-// and STLXR pair primitives.
-//
-// WARNING: the bitwise ops provided in this package doesn't imply any memory
-// ordering. Using them to construct locks must employ proper memory barriers.
-package atomicbitops
-
-// AndUint32 atomically applies bitwise and operation to *addr with val.
-func AndUint32(addr *uint32, val uint32)
-
-// OrUint32 atomically applies bitwise or operation to *addr with val.
-func OrUint32(addr *uint32, val uint32)
-
-// XorUint32 atomically applies bitwise xor operation to *addr with val.
-func XorUint32(addr *uint32, val uint32)
-
-// CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint32(addr *uint32, old, new uint32) uint32
-
-// AndUint64 atomically applies bitwise and operation to *addr with val.
-func AndUint64(addr *uint64, val uint64)
-
-// OrUint64 atomically applies bitwise or operation to *addr with val.
-func OrUint64(addr *uint64, val uint64)
-
-// XorUint64 atomically applies bitwise xor operation to *addr with val.
-func XorUint64(addr *uint64, val uint64)
-
-// CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint64(addr *uint64, old, new uint64) uint64
-
-// IncUnlessZeroInt32 increments the value stored at the given address and
-// returns true; unless the value stored in the pointer is zero, in which case
-// it is left unmodified and false is returned.
-func IncUnlessZeroInt32(addr *int32) bool
-
-// DecUnlessOneInt32 decrements the value stored at the given address and
-// returns true; unless the value stored in the pointer is 1, in which case it
-// is left unmodified and false is returned.
-func DecUnlessOneInt32(addr *int32) bool
diff --git a/pkg/atomicbitops/atomic_bitops_amd64.s b/pkg/atomicbitops/atomic_bitops_amd64.s
deleted file mode 100644
index db0972001..000000000
--- a/pkg/atomicbitops/atomic_bitops_amd64.s
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build amd64
-
-#include "textflag.h"
-
-TEXT ·AndUint32(SB),$0-12
-  MOVQ  addr+0(FP), BP
-  MOVL  val+8(FP), AX
-  LOCK
-  ANDL   AX, 0(BP)
-  RET
-
-TEXT ·OrUint32(SB),$0-12
-  MOVQ  addr+0(FP), BP
-  MOVL  val+8(FP), AX
-  LOCK
-  ORL   AX, 0(BP)
-  RET
-
-TEXT ·XorUint32(SB),$0-12
-  MOVQ  addr+0(FP), BP
-  MOVL  val+8(FP), AX
-  LOCK
-  XORL   AX, 0(BP)
-  RET
-
-TEXT ·CompareAndSwapUint32(SB),$0-20
-  MOVQ  addr+0(FP), DI
-  MOVL  old+8(FP), AX
-  MOVL  new+12(FP), DX
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  MOVL  AX, ret+16(FP)
-  RET
-
-TEXT ·AndUint64(SB),$0-16
-  MOVQ  addr+0(FP), BP
-  MOVQ  val+8(FP), AX
-  LOCK
-  ANDQ   AX, 0(BP)
-  RET
-
-TEXT ·OrUint64(SB),$0-16
-  MOVQ  addr+0(FP), BP
-  MOVQ  val+8(FP), AX
-  LOCK
-  ORQ   AX, 0(BP)
-  RET
-
-TEXT ·XorUint64(SB),$0-16
-  MOVQ  addr+0(FP), BP
-  MOVQ  val+8(FP), AX
-  LOCK
-  XORQ   AX, 0(BP)
-  RET
-
-TEXT ·CompareAndSwapUint64(SB),$0-32
-  MOVQ  addr+0(FP), DI
-  MOVQ  old+8(FP), AX
-  MOVQ  new+16(FP), DX
-  LOCK
-  CMPXCHGQ DX, 0(DI)
-  MOVQ  AX, ret+24(FP)
-  RET
-
-TEXT ·IncUnlessZeroInt32(SB),NOSPLIT,$0-9
-  MOVQ     addr+0(FP), DI
-  MOVL     0(DI), AX
-
-retry:
-  TESTL    AX, AX
-  JZ       fail
-  LEAL     1(AX), DX
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  JNZ      retry
-
-  SETEQ    ret+8(FP)
-  RET
-
-fail:
-  MOVB     AX, ret+8(FP)
-  RET
-
-TEXT ·DecUnlessOneInt32(SB),NOSPLIT,$0-9
-  MOVQ     addr+0(FP), DI
-  MOVL     0(DI), AX
-
-retry:
-  LEAL     -1(AX), DX
-  TESTL    DX, DX
-  JZ       fail
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  JNZ      retry
-
-  SETEQ    ret+8(FP)
-  RET
-
-fail:
-  MOVB     DX, ret+8(FP)
-  RET
diff --git a/pkg/atomicbitops/atomic_bitops_arm64.s b/pkg/atomicbitops/atomic_bitops_arm64.s
deleted file mode 100644
index 97f8808c1..000000000
--- a/pkg/atomicbitops/atomic_bitops_arm64.s
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build arm64
-
-#include "textflag.h"
-
-TEXT ·AndUint32(SB),$0-12
-  MOVD    ptr+0(FP), R0
-  MOVW    val+8(FP), R1
-again:
-  LDAXRW  (R0), R2
-  ANDW    R1, R2
-  STLXRW  R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·OrUint32(SB),$0-12
-  MOVD    ptr+0(FP), R0
-  MOVW    val+8(FP), R1
-again:
-  LDAXRW  (R0), R2
-  ORRW    R1, R2
-  STLXRW  R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·XorUint32(SB),$0-12
-  MOVD    ptr+0(FP), R0
-  MOVW    val+8(FP), R1
-again:
-  LDAXRW  (R0), R2
-  EORW    R1, R2
-  STLXRW  R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·CompareAndSwapUint32(SB),$0-20
-  MOVD addr+0(FP), R0
-  MOVW old+8(FP), R1
-  MOVW new+12(FP), R2
-
-again:
-  LDAXRW (R0), R3
-  CMPW R1, R3
-  BNE done
-  STLXRW R2, (R0), R4
-  CBNZ R4, again
-done:
-  MOVW R3, prev+16(FP)
-  RET
-
-TEXT ·AndUint64(SB),$0-16
-  MOVD    ptr+0(FP), R0
-  MOVD    val+8(FP), R1
-again:
-  LDAXR   (R0), R2
-  AND     R1, R2
-  STLXR   R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·OrUint64(SB),$0-16
-  MOVD    ptr+0(FP), R0
-  MOVD    val+8(FP), R1
-again:
-  LDAXR   (R0), R2
-  ORR     R1, R2
-  STLXR   R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·XorUint64(SB),$0-16
-  MOVD    ptr+0(FP), R0
-  MOVD    val+8(FP), R1
-again:
-  LDAXR   (R0), R2
-  EOR     R1, R2
-  STLXR   R2, (R0), R3
-  CBNZ    R3, again
-  RET
-
-TEXT ·CompareAndSwapUint64(SB),$0-32
-  MOVD addr+0(FP), R0
-  MOVD old+8(FP), R1
-  MOVD new+16(FP), R2
-
-again:
-  LDAXR (R0), R3
-  CMP R1, R3
-  BNE done
-  STLXR R2, (R0), R4
-  CBNZ R4, again
-done:
-  MOVD R3, prev+24(FP)
-  RET
-
-TEXT ·IncUnlessZeroInt32(SB),NOSPLIT,$0-9
-  MOVD addr+0(FP), R0
-
-again:
-  LDAXRW (R0), R1
-  CBZ R1, fail
-  ADDW $1, R1
-  STLXRW R1, (R0), R2
-  CBNZ R2, again
-  MOVW $1, R2
-  MOVB R2, ret+8(FP)
-  RET
-fail:
-  MOVB ZR, ret+8(FP)
-  RET
-
-TEXT ·DecUnlessOneInt32(SB),NOSPLIT,$0-9
-  MOVD addr+0(FP), R0
-
-again:
-  LDAXRW (R0), R1
-  SUBSW $1, R1, R1
-  BEQ fail
-  STLXRW R1, (R0), R2
-  CBNZ R2, again
-  MOVW $1, R2
-  MOVB R2, ret+8(FP)
-  RET
-fail:
-  MOVB ZR, ret+8(FP)
-  RET
diff --git a/pkg/atomicbitops/atomic_bitops_common.go b/pkg/atomicbitops/atomic_bitops_common.go
deleted file mode 100644
index 85163ad62..000000000
--- a/pkg/atomicbitops/atomic_bitops_common.go
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build !amd64,!arm64
-
-package atomicbitops
-
-import (
-	"sync/atomic"
-)
-
-// AndUint32 atomically applies bitwise and operation to *addr with val.
-func AndUint32(addr *uint32, val uint32) {
-	for {
-		o := atomic.LoadUint32(addr)
-		n := o & val
-		if atomic.CompareAndSwapUint32(addr, o, n) {
-			break
-		}
-	}
-}
-
-// OrUint32 atomically applies bitwise or operation to *addr with val.
-func OrUint32(addr *uint32, val uint32) {
-	for {
-		o := atomic.LoadUint32(addr)
-		n := o | val
-		if atomic.CompareAndSwapUint32(addr, o, n) {
-			break
-		}
-	}
-}
-
-// XorUint32 atomically applies bitwise xor operation to *addr with val.
-func XorUint32(addr *uint32, val uint32) {
-	for {
-		o := atomic.LoadUint32(addr)
-		n := o ^ val
-		if atomic.CompareAndSwapUint32(addr, o, n) {
-			break
-		}
-	}
-}
-
-// CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
-	for {
-		prev = atomic.LoadUint32(addr)
-		if prev != old {
-			return
-		}
-		if atomic.CompareAndSwapUint32(addr, old, new) {
-			return
-		}
-	}
-}
-
-// AndUint64 atomically applies bitwise and operation to *addr with val.
-func AndUint64(addr *uint64, val uint64) {
-	for {
-		o := atomic.LoadUint64(addr)
-		n := o & val
-		if atomic.CompareAndSwapUint64(addr, o, n) {
-			break
-		}
-	}
-}
-
-// OrUint64 atomically applies bitwise or operation to *addr with val.
-func OrUint64(addr *uint64, val uint64) {
-	for {
-		o := atomic.LoadUint64(addr)
-		n := o | val
-		if atomic.CompareAndSwapUint64(addr, o, n) {
-			break
-		}
-	}
-}
-
-// XorUint64 atomically applies bitwise xor operation to *addr with val.
-func XorUint64(addr *uint64, val uint64) {
-	for {
-		o := atomic.LoadUint64(addr)
-		n := o ^ val
-		if atomic.CompareAndSwapUint64(addr, o, n) {
-			break
-		}
-	}
-}
-
-// CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
-// the value previously stored at addr.
-func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
-	for {
-		prev = atomic.LoadUint64(addr)
-		if prev != old {
-			return
-		}
-		if atomic.CompareAndSwapUint64(addr, old, new) {
-			return
-		}
-	}
-}
-
-// IncUnlessZeroInt32 increments the value stored at the given address and
-// returns true; unless the value stored in the pointer is zero, in which case
-// it is left unmodified and false is returned.
-func IncUnlessZeroInt32(addr *int32) bool {
-	for {
-		v := atomic.LoadInt32(addr)
-		if v == 0 {
-			return false
-		}
-
-		if atomic.CompareAndSwapInt32(addr, v, v+1) {
-			return true
-		}
-	}
-}
-
-// DecUnlessOneInt32 decrements the value stored at the given address and
-// returns true; unless the value stored in the pointer is 1, in which case it
-// is left unmodified and false is returned.
-func DecUnlessOneInt32(addr *int32) bool {
-	for {
-		v := atomic.LoadInt32(addr)
-		if v == 1 {
-			return false
-		}
-
-		if atomic.CompareAndSwapInt32(addr, v, v-1) {
-			return true
-		}
-	}
-}
diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go
deleted file mode 100644
index 9466d3e23..000000000
--- a/pkg/atomicbitops/atomic_bitops_test.go
+++ /dev/null
@@ -1,262 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package atomicbitops
-
-import (
-	"runtime"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-const iterations = 100
-
-func detectRaces32(val, target uint32, fn func(*uint32, uint32)) bool {
-	runtime.GOMAXPROCS(100)
-	for n := 0; n < iterations; n++ {
-		x := val
-		var wg sync.WaitGroup
-		for i := uint32(0); i < 32; i++ {
-			wg.Add(1)
-			go func(a *uint32, i uint32) {
-				defer wg.Done()
-				fn(a, uint32(1<<i))
-			}(&x, i)
-		}
-		wg.Wait()
-		if x != target {
-			return true
-		}
-	}
-	return false
-}
-
-func detectRaces64(val, target uint64, fn func(*uint64, uint64)) bool {
-	runtime.GOMAXPROCS(100)
-	for n := 0; n < iterations; n++ {
-		x := val
-		var wg sync.WaitGroup
-		for i := uint64(0); i < 64; i++ {
-			wg.Add(1)
-			go func(a *uint64, i uint64) {
-				defer wg.Done()
-				fn(a, uint64(1<<i))
-			}(&x, i)
-		}
-		wg.Wait()
-		if x != target {
-			return true
-		}
-	}
-	return false
-}
-
-func TestOrUint32(t *testing.T) {
-	if detectRaces32(0x0, 0xffffffff, OrUint32) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestAndUint32(t *testing.T) {
-	if detectRaces32(0xf0f0f0f0, 0x00000000, AndUint32) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestXorUint32(t *testing.T) {
-	if detectRaces32(0xf0f0f0f0, 0x0f0f0f0f, XorUint32) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestOrUint64(t *testing.T) {
-	if detectRaces64(0x0, 0xffffffffffffffff, OrUint64) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestAndUint64(t *testing.T) {
-	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0, AndUint64) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestXorUint64(t *testing.T) {
-	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, XorUint64) {
-		t.Error("Data race detected!")
-	}
-}
-
-func TestCompareAndSwapUint32(t *testing.T) {
-	tests := []struct {
-		name string
-		prev uint32
-		old  uint32
-		new  uint32
-		next uint32
-	}{
-		{
-			name: "Successful compare-and-swap with prev == new",
-			prev: 10,
-			old:  10,
-			new:  10,
-			next: 10,
-		},
-		{
-			name: "Successful compare-and-swap with prev != new",
-			prev: 20,
-			old:  20,
-			new:  22,
-			next: 22,
-		},
-		{
-			name: "Failed compare-and-swap with prev == new",
-			prev: 31,
-			old:  30,
-			new:  31,
-			next: 31,
-		},
-		{
-			name: "Failed compare-and-swap with prev != new",
-			prev: 41,
-			old:  40,
-			new:  42,
-			next: 41,
-		},
-	}
-	for _, test := range tests {
-		val := test.prev
-		prev := CompareAndSwapUint32(&val, test.old, test.new)
-		if got, want := prev, test.prev; got != want {
-			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
-		}
-		if got, want := val, test.next; got != want {
-			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
-		}
-	}
-}
-
-func TestCompareAndSwapUint64(t *testing.T) {
-	tests := []struct {
-		name string
-		prev uint64
-		old  uint64
-		new  uint64
-		next uint64
-	}{
-		{
-			name: "Successful compare-and-swap with prev == new",
-			prev: 0x100000000,
-			old:  0x100000000,
-			new:  0x100000000,
-			next: 0x100000000,
-		},
-		{
-			name: "Successful compare-and-swap with prev != new",
-			prev: 0x200000000,
-			old:  0x200000000,
-			new:  0x200000002,
-			next: 0x200000002,
-		},
-		{
-			name: "Failed compare-and-swap with prev == new",
-			prev: 0x300000001,
-			old:  0x300000000,
-			new:  0x300000001,
-			next: 0x300000001,
-		},
-		{
-			name: "Failed compare-and-swap with prev != new",
-			prev: 0x400000001,
-			old:  0x400000000,
-			new:  0x400000002,
-			next: 0x400000001,
-		},
-	}
-	for _, test := range tests {
-		val := test.prev
-		prev := CompareAndSwapUint64(&val, test.old, test.new)
-		if got, want := prev, test.prev; got != want {
-			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
-		}
-		if got, want := val, test.next; got != want {
-			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
-		}
-	}
-}
-
-func TestIncUnlessZeroInt32(t *testing.T) {
-	for _, test := range []struct {
-		initial int32
-		final   int32
-		ret     bool
-	}{
-		{
-			initial: 0,
-			final:   0,
-			ret:     false,
-		},
-		{
-			initial: 1,
-			final:   2,
-			ret:     true,
-		},
-		{
-			initial: 2,
-			final:   3,
-			ret:     true,
-		},
-	} {
-		val := test.initial
-		if got, want := IncUnlessZeroInt32(&val), test.ret; got != want {
-			t.Errorf("For initial value of %d: incorrect return value: got %v, wanted %v", test.initial, got, want)
-		}
-		if got, want := val, test.final; got != want {
-			t.Errorf("For initial value of %d: incorrect final value: got %d, wanted %d", test.initial, got, want)
-		}
-	}
-}
-
-func TestDecUnlessOneInt32(t *testing.T) {
-	for _, test := range []struct {
-		initial int32
-		final   int32
-		ret     bool
-	}{
-		{
-			initial: 0,
-			final:   -1,
-			ret:     true,
-		},
-		{
-			initial: 1,
-			final:   1,
-			ret:     false,
-		},
-		{
-			initial: 2,
-			final:   1,
-			ret:     true,
-		},
-	} {
-		val := test.initial
-		if got, want := DecUnlessOneInt32(&val), test.ret; got != want {
-			t.Errorf("For initial value of %d: incorrect return value: got %v, wanted %v", test.initial, got, want)
-		}
-		if got, want := val, test.final; got != want {
-			t.Errorf("For initial value of %d: incorrect final value: got %d, wanted %d", test.initial, got, want)
-		}
-	}
-}
diff --git a/pkg/atomicbitops/atomicbitops.go b/pkg/atomicbitops/atomicbitops.go
new file mode 100644
index 000000000..1be081719
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops.go
@@ -0,0 +1,47 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 arm64
+
+// Package atomicbitops provides extensions to the sync/atomic package.
+//
+// All read-modify-write operations implemented by this package have
+// acquire-release memory ordering (like sync/atomic).
+package atomicbitops
+
+// AndUint32 atomically applies bitwise AND operation to *addr with val.
+func AndUint32(addr *uint32, val uint32)
+
+// OrUint32 atomically applies bitwise OR operation to *addr with val.
+func OrUint32(addr *uint32, val uint32)
+
+// XorUint32 atomically applies bitwise XOR operation to *addr with val.
+func XorUint32(addr *uint32, val uint32)
+
+// CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
+// the value previously stored at addr.
+func CompareAndSwapUint32(addr *uint32, old, new uint32) uint32
+
+// AndUint64 atomically applies bitwise AND operation to *addr with val.
+func AndUint64(addr *uint64, val uint64)
+
+// OrUint64 atomically applies bitwise OR operation to *addr with val.
+func OrUint64(addr *uint64, val uint64)
+
+// XorUint64 atomically applies bitwise XOR operation to *addr with val.
+func XorUint64(addr *uint64, val uint64)
+
+// CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
+// the value previously stored at addr.
+func CompareAndSwapUint64(addr *uint64, old, new uint64) uint64
diff --git a/pkg/atomicbitops/atomicbitops_amd64.s b/pkg/atomicbitops/atomicbitops_amd64.s
new file mode 100644
index 000000000..54c887ee5
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_amd64.s
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+#include "textflag.h"
+
+TEXT ·AndUint32(SB),$0-12
+  MOVQ  addr+0(FP), BP
+  MOVL  val+8(FP), AX
+  LOCK
+  ANDL   AX, 0(BP)
+  RET
+
+TEXT ·OrUint32(SB),$0-12
+  MOVQ  addr+0(FP), BP
+  MOVL  val+8(FP), AX
+  LOCK
+  ORL   AX, 0(BP)
+  RET
+
+TEXT ·XorUint32(SB),$0-12
+  MOVQ  addr+0(FP), BP
+  MOVL  val+8(FP), AX
+  LOCK
+  XORL   AX, 0(BP)
+  RET
+
+TEXT ·CompareAndSwapUint32(SB),$0-20
+  MOVQ  addr+0(FP), DI
+  MOVL  old+8(FP), AX
+  MOVL  new+12(FP), DX
+  LOCK
+  CMPXCHGL DX, 0(DI)
+  MOVL  AX, ret+16(FP)
+  RET
+
+TEXT ·AndUint64(SB),$0-16
+  MOVQ  addr+0(FP), BP
+  MOVQ  val+8(FP), AX
+  LOCK
+  ANDQ   AX, 0(BP)
+  RET
+
+TEXT ·OrUint64(SB),$0-16
+  MOVQ  addr+0(FP), BP
+  MOVQ  val+8(FP), AX
+  LOCK
+  ORQ   AX, 0(BP)
+  RET
+
+TEXT ·XorUint64(SB),$0-16
+  MOVQ  addr+0(FP), BP
+  MOVQ  val+8(FP), AX
+  LOCK
+  XORQ   AX, 0(BP)
+  RET
+
+TEXT ·CompareAndSwapUint64(SB),$0-32
+  MOVQ  addr+0(FP), DI
+  MOVQ  old+8(FP), AX
+  MOVQ  new+16(FP), DX
+  LOCK
+  CMPXCHGQ DX, 0(DI)
+  MOVQ  AX, ret+24(FP)
+  RET
diff --git a/pkg/atomicbitops/atomicbitops_arm64.s b/pkg/atomicbitops/atomicbitops_arm64.s
new file mode 100644
index 000000000..5c780851b
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_arm64.s
@@ -0,0 +1,105 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+#include "textflag.h"
+
+TEXT ·AndUint32(SB),$0-12
+  MOVD    ptr+0(FP), R0
+  MOVW    val+8(FP), R1
+again:
+  LDAXRW  (R0), R2
+  ANDW    R1, R2
+  STLXRW  R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·OrUint32(SB),$0-12
+  MOVD    ptr+0(FP), R0
+  MOVW    val+8(FP), R1
+again:
+  LDAXRW  (R0), R2
+  ORRW    R1, R2
+  STLXRW  R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·XorUint32(SB),$0-12
+  MOVD    ptr+0(FP), R0
+  MOVW    val+8(FP), R1
+again:
+  LDAXRW  (R0), R2
+  EORW    R1, R2
+  STLXRW  R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·CompareAndSwapUint32(SB),$0-20
+  MOVD addr+0(FP), R0
+  MOVW old+8(FP), R1
+  MOVW new+12(FP), R2
+again:
+  LDAXRW (R0), R3
+  CMPW R1, R3
+  BNE done
+  STLXRW R2, (R0), R4
+  CBNZ R4, again
+done:
+  MOVW R3, prev+16(FP)
+  RET
+
+TEXT ·AndUint64(SB),$0-16
+  MOVD    ptr+0(FP), R0
+  MOVD    val+8(FP), R1
+again:
+  LDAXR   (R0), R2
+  AND     R1, R2
+  STLXR   R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·OrUint64(SB),$0-16
+  MOVD    ptr+0(FP), R0
+  MOVD    val+8(FP), R1
+again:
+  LDAXR   (R0), R2
+  ORR     R1, R2
+  STLXR   R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·XorUint64(SB),$0-16
+  MOVD    ptr+0(FP), R0
+  MOVD    val+8(FP), R1
+again:
+  LDAXR   (R0), R2
+  EOR     R1, R2
+  STLXR   R2, (R0), R3
+  CBNZ    R3, again
+  RET
+
+TEXT ·CompareAndSwapUint64(SB),$0-32
+  MOVD addr+0(FP), R0
+  MOVD old+8(FP), R1
+  MOVD new+16(FP), R2
+again:
+  LDAXR (R0), R3
+  CMP R1, R3
+  BNE done
+  STLXR R2, (R0), R4
+  CBNZ R4, again
+done:
+  MOVD R3, prev+24(FP)
+  RET
diff --git a/pkg/atomicbitops/atomicbitops_noasm.go b/pkg/atomicbitops/atomicbitops_noasm.go
new file mode 100644
index 000000000..3b2898256
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_noasm.go
@@ -0,0 +1,105 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !amd64,!arm64
+
+package atomicbitops
+
+import (
+	"sync/atomic"
+)
+
+func AndUint32(addr *uint32, val uint32) {
+	for {
+		o := atomic.LoadUint32(addr)
+		n := o & val
+		if atomic.CompareAndSwapUint32(addr, o, n) {
+			break
+		}
+	}
+}
+
+func OrUint32(addr *uint32, val uint32) {
+	for {
+		o := atomic.LoadUint32(addr)
+		n := o | val
+		if atomic.CompareAndSwapUint32(addr, o, n) {
+			break
+		}
+	}
+}
+
+func XorUint32(addr *uint32, val uint32) {
+	for {
+		o := atomic.LoadUint32(addr)
+		n := o ^ val
+		if atomic.CompareAndSwapUint32(addr, o, n) {
+			break
+		}
+	}
+}
+
+func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
+	for {
+		prev = atomic.LoadUint32(addr)
+		if prev != old {
+			return
+		}
+		if atomic.CompareAndSwapUint32(addr, old, new) {
+			return
+		}
+	}
+}
+
+func AndUint64(addr *uint64, val uint64) {
+	for {
+		o := atomic.LoadUint64(addr)
+		n := o & val
+		if atomic.CompareAndSwapUint64(addr, o, n) {
+			break
+		}
+	}
+}
+
+func OrUint64(addr *uint64, val uint64) {
+	for {
+		o := atomic.LoadUint64(addr)
+		n := o | val
+		if atomic.CompareAndSwapUint64(addr, o, n) {
+			break
+		}
+	}
+}
+
+func XorUint64(addr *uint64, val uint64) {
+	for {
+		o := atomic.LoadUint64(addr)
+		n := o ^ val
+		if atomic.CompareAndSwapUint64(addr, o, n) {
+			break
+		}
+	}
+}
+
+func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
+	for {
+		prev = atomic.LoadUint64(addr)
+		if prev != old {
+			return
+		}
+		if atomic.CompareAndSwapUint64(addr, old, new) {
+			return
+		}
+	}
+}
diff --git a/pkg/atomicbitops/atomicbitops_test.go b/pkg/atomicbitops/atomicbitops_test.go
new file mode 100644
index 000000000..73af71bb4
--- /dev/null
+++ b/pkg/atomicbitops/atomicbitops_test.go
@@ -0,0 +1,198 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package atomicbitops
+
+import (
+	"runtime"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+const iterations = 100
+
+func detectRaces32(val, target uint32, fn func(*uint32, uint32)) bool {
+	runtime.GOMAXPROCS(100)
+	for n := 0; n < iterations; n++ {
+		x := val
+		var wg sync.WaitGroup
+		for i := uint32(0); i < 32; i++ {
+			wg.Add(1)
+			go func(a *uint32, i uint32) {
+				defer wg.Done()
+				fn(a, uint32(1<<i))
+			}(&x, i)
+		}
+		wg.Wait()
+		if x != target {
+			return true
+		}
+	}
+	return false
+}
+
+func detectRaces64(val, target uint64, fn func(*uint64, uint64)) bool {
+	runtime.GOMAXPROCS(100)
+	for n := 0; n < iterations; n++ {
+		x := val
+		var wg sync.WaitGroup
+		for i := uint64(0); i < 64; i++ {
+			wg.Add(1)
+			go func(a *uint64, i uint64) {
+				defer wg.Done()
+				fn(a, uint64(1<<i))
+			}(&x, i)
+		}
+		wg.Wait()
+		if x != target {
+			return true
+		}
+	}
+	return false
+}
+
+func TestOrUint32(t *testing.T) {
+	if detectRaces32(0x0, 0xffffffff, OrUint32) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestAndUint32(t *testing.T) {
+	if detectRaces32(0xf0f0f0f0, 0x00000000, AndUint32) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestXorUint32(t *testing.T) {
+	if detectRaces32(0xf0f0f0f0, 0x0f0f0f0f, XorUint32) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestOrUint64(t *testing.T) {
+	if detectRaces64(0x0, 0xffffffffffffffff, OrUint64) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestAndUint64(t *testing.T) {
+	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0, AndUint64) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestXorUint64(t *testing.T) {
+	if detectRaces64(0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, XorUint64) {
+		t.Error("Data race detected!")
+	}
+}
+
+func TestCompareAndSwapUint32(t *testing.T) {
+	tests := []struct {
+		name string
+		prev uint32
+		old  uint32
+		new  uint32
+		next uint32
+	}{
+		{
+			name: "Successful compare-and-swap with prev == new",
+			prev: 10,
+			old:  10,
+			new:  10,
+			next: 10,
+		},
+		{
+			name: "Successful compare-and-swap with prev != new",
+			prev: 20,
+			old:  20,
+			new:  22,
+			next: 22,
+		},
+		{
+			name: "Failed compare-and-swap with prev == new",
+			prev: 31,
+			old:  30,
+			new:  31,
+			next: 31,
+		},
+		{
+			name: "Failed compare-and-swap with prev != new",
+			prev: 41,
+			old:  40,
+			new:  42,
+			next: 41,
+		},
+	}
+	for _, test := range tests {
+		val := test.prev
+		prev := CompareAndSwapUint32(&val, test.old, test.new)
+		if got, want := prev, test.prev; got != want {
+			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
+		}
+		if got, want := val, test.next; got != want {
+			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
+		}
+	}
+}
+
+func TestCompareAndSwapUint64(t *testing.T) {
+	tests := []struct {
+		name string
+		prev uint64
+		old  uint64
+		new  uint64
+		next uint64
+	}{
+		{
+			name: "Successful compare-and-swap with prev == new",
+			prev: 0x100000000,
+			old:  0x100000000,
+			new:  0x100000000,
+			next: 0x100000000,
+		},
+		{
+			name: "Successful compare-and-swap with prev != new",
+			prev: 0x200000000,
+			old:  0x200000000,
+			new:  0x200000002,
+			next: 0x200000002,
+		},
+		{
+			name: "Failed compare-and-swap with prev == new",
+			prev: 0x300000001,
+			old:  0x300000000,
+			new:  0x300000001,
+			next: 0x300000001,
+		},
+		{
+			name: "Failed compare-and-swap with prev != new",
+			prev: 0x400000001,
+			old:  0x400000000,
+			new:  0x400000002,
+			next: 0x400000001,
+		},
+	}
+	for _, test := range tests {
+		val := test.prev
+		prev := CompareAndSwapUint64(&val, test.old, test.new)
+		if got, want := prev, test.prev; got != want {
+			t.Errorf("%s: incorrect returned previous value: got %d, expected %d", test.name, got, want)
+		}
+		if got, want := val, test.next; got != want {
+			t.Errorf("%s: incorrect value stored in val: got %d, expected %d", test.name, got, want)
+		}
+	}
+}
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index e58a63deb..94d39af60 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -42,8 +41,15 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
 func (mm *MemoryManager) Activate() error {
 	// Fast path: the MemoryManager already has an active
 	// platform.AddressSpace, and we just need to indicate that we need it too.
-	if atomicbitops.IncUnlessZeroInt32(&mm.active) {
-		return nil
+	for {
+		active := atomic.LoadInt32(&mm.active)
+		if active == 0 {
+			// Fall back to the slow path.
+			break
+		}
+		if atomic.CompareAndSwapInt32(&mm.active, active, active+1) {
+			return nil
+		}
 	}
 
 	for {
@@ -118,8 +124,15 @@ func (mm *MemoryManager) Activate() error {
 func (mm *MemoryManager) Deactivate() {
 	// Fast path: this is not the last goroutine to deactivate the
 	// MemoryManager.
-	if atomicbitops.DecUnlessOneInt32(&mm.active) {
-		return
+	for {
+		active := atomic.LoadInt32(&mm.active)
+		if active == 1 {
+			// Fall back to the slow path.
+			break
+		}
+		if atomic.CompareAndSwapInt32(&mm.active, active, active-1) {
+			return
+		}
 	}
 
 	mm.activeMu.Lock()
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 47b8fbf43..3c263ebaa 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -229,7 +228,15 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 // IncUsers increments mm's user count and returns true. If the user count is
 // already 0, IncUsers does nothing and returns false.
 func (mm *MemoryManager) IncUsers() bool {
-	return atomicbitops.IncUnlessZeroInt32(&mm.users)
+	for {
+		users := atomic.LoadInt32(&mm.users)
+		if users == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
+			return true
+		}
+	}
 }
 
 // DecUsers decrements mm's user count. If the user count reaches 0, all
-- 
cgit v1.2.3


From a3582de6186edcc88e022af2b9f9c1cef90e44ed Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 13:49:17 -0800
Subject: cpuid: cache the maximum size of xsave state

perf shows that ExtendedStateSize cosumes more than 20% of cpu:

    23.61%    23.61%  [.] pkg/cpuid/cpuid.HostID

PiperOrigin-RevId: 295813263
---
 pkg/cpuid/cpuid_x86.go | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go
index 333ca0a04..a0bc55ea1 100644
--- a/pkg/cpuid/cpuid_x86.go
+++ b/pkg/cpuid/cpuid_x86.go
@@ -725,6 +725,18 @@ func vendorIDFromRegs(bx, cx, dx uint32) string {
 	return string(bytes)
 }
 
+var maxXsaveSize = func() uint32 {
+	// Leaf 0 of xsaveinfo function returns the size for currently
+	// enabled xsave features in ebx, the maximum size if all valid
+	// features are saved with xsave in ecx, and valid XCR0 bits in
+	// edx:eax.
+	//
+	// If xSaveInfo isn't supported, cpuid will not fault but will
+	// return bogus values.
+	_, _, maxXsaveSize, _ := HostID(uint32(xSaveInfo), 0)
+	return maxXsaveSize
+}()
+
 // ExtendedStateSize returns the number of bytes needed to save the "extended
 // state" for this processor and the boundary it must be aligned to. Extended
 // state includes floating point registers, and other cpu state that's not
@@ -736,12 +748,7 @@ func vendorIDFromRegs(bx, cx, dx uint32) string {
 // about 2.5K worst case, with avx512).
 func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
 	if fs.UseXsave() {
-		// Leaf 0 of xsaveinfo function returns the size for currently
-		// enabled xsave features in ebx, the maximum size if all valid
-		// features are saved with xsave in ecx, and valid XCR0 bits in
-		// edx:eax.
-		_, _, maxSize, _ := HostID(uint32(xSaveInfo), 0)
-		return uint(maxSize), 64
+		return uint(maxXsaveSize), 64
 	}
 
 	// If we don't support xsave, we fall back to fxsave, which requires
-- 
cgit v1.2.3


From 55c553ae8c7937be4a7e10e0c7a727d132317e89 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:17:45 -0800
Subject: Add //pkg/syncevent.

Package syncevent is intended to subsume ~all uses of channels in the sentry
(including //pkg/waiter), as well as //pkg/sleep.

Compared to channels:

- Delivery of events to a syncevent.Receiver allows *synchronous* execution of
  an arbitrary callback, whereas delivery of events to a channel requires a
  goroutine to receive from that channel, resulting in substantial scheduling
  overhead. (This is also part of the motivation for the waiter package.)

- syncevent.Waiter can wait on multiple event sources without the high O(N)
  overhead of select. (This is the same motivation as for the sleep package.)

Compared to the waiter package:

- syncevent.Waiters are intended to be persistent (i.e. per-kernel.Task), and
  syncevent.Broadcaster (analogous to waiter.Queue) is a hash table rather than
  a linked list, such that blocking is (usually) allocation-free.

- syncevent.Source (analogous to waiter.Waitable) does not include an equivalent
  to waiter.Waitable.Readiness(), since this is inappropriate for transient
  events (see e.g. //pkg/sentry/kernel/time.ClockEventSource).

Compared to the sleep package:

- syncevent events are represented by bits in a bitmask rather than discrete
  sleep.Waker objects, reducing overhead and making it feasible to broadcast
  events to multiple syncevent.Receivers.

- syncevent.Receiver invokes an arbitrary callback, which is required by the
  sentry's epoll implementation. (syncevent.Waiter, which is analogous to
  sleep.Sleeper, pairs a syncevent.Receiver with a callback that wakes a
  waiting goroutine; the implementation of this aspect is nearly identical to
  that of sleep.Sleeper, except that it represents *runtime.g as unsafe.Pointer
  rather than uintptr.)

- syncevent.Waiter.Wait (analogous to sleep.Sleeper.Fetch(block=true)) does not
  automatically un-assert returned events. This is useful in cases where the
  path for handling an event is not the same as the path that observes it, such
  as for application signals (a la Linux's TIF_SIGPENDING).

- Unlike sleep.Sleeper, which Fetches Wakers in the order that they were
  Asserted, the event bitmasks used by syncevent.Receiver have no way of
  preserving event arrival order. (This is similar to select, which goes out of
  its way to randomize event ordering.)

The disadvantage of the syncevent package is that, since events are represented
by bits in a uint64 bitmask, each syncevent.Receiver can "only" multiplex
between 64 distinct events; this does not affect any known use case.

Benchmarks:

BenchmarkBroadcasterSubscribeUnsubscribe
BenchmarkBroadcasterSubscribeUnsubscribe-12         	45133884	        26.3 ns/op
BenchmarkMapSubscribeUnsubscribe
BenchmarkMapSubscribeUnsubscribe-12                 	28504662	        41.8 ns/op
BenchmarkQueueSubscribeUnsubscribe
BenchmarkQueueSubscribeUnsubscribe-12               	22747668	        45.6 ns/op
BenchmarkBroadcasterSubscribeUnsubscribeBatch
BenchmarkBroadcasterSubscribeUnsubscribeBatch-12    	31609177	        37.8 ns/op
BenchmarkMapSubscribeUnsubscribeBatch
BenchmarkMapSubscribeUnsubscribeBatch-12            	17563906	        62.1 ns/op
BenchmarkQueueSubscribeUnsubscribeBatch
BenchmarkQueueSubscribeUnsubscribeBatch-12          	26248838	        46.6 ns/op
BenchmarkBroadcasterBroadcastRedundant
BenchmarkBroadcasterBroadcastRedundant/0
BenchmarkBroadcasterBroadcastRedundant/0-12         	100907563	        11.8 ns/op
BenchmarkBroadcasterBroadcastRedundant/1
BenchmarkBroadcasterBroadcastRedundant/1-12         	85103068	        13.3 ns/op
BenchmarkBroadcasterBroadcastRedundant/4
BenchmarkBroadcasterBroadcastRedundant/4-12         	52716502	        22.3 ns/op
BenchmarkBroadcasterBroadcastRedundant/16
BenchmarkBroadcasterBroadcastRedundant/16-12        	20278165	        58.7 ns/op
BenchmarkBroadcasterBroadcastRedundant/64
BenchmarkBroadcasterBroadcastRedundant/64-12        	 5905428	       205 ns/op
BenchmarkMapBroadcastRedundant
BenchmarkMapBroadcastRedundant/0
BenchmarkMapBroadcastRedundant/0-12                 	87532734	        13.5 ns/op
BenchmarkMapBroadcastRedundant/1
BenchmarkMapBroadcastRedundant/1-12                 	28488411	        36.3 ns/op
BenchmarkMapBroadcastRedundant/4
BenchmarkMapBroadcastRedundant/4-12                 	19628920	        60.9 ns/op
BenchmarkMapBroadcastRedundant/16
BenchmarkMapBroadcastRedundant/16-12                	 6026980	       192 ns/op
BenchmarkMapBroadcastRedundant/64
BenchmarkMapBroadcastRedundant/64-12                	 1640858	       754 ns/op
BenchmarkQueueBroadcastRedundant
BenchmarkQueueBroadcastRedundant/0
BenchmarkQueueBroadcastRedundant/0-12               	96904807	        12.0 ns/op
BenchmarkQueueBroadcastRedundant/1
BenchmarkQueueBroadcastRedundant/1-12               	73521873	        16.3 ns/op
BenchmarkQueueBroadcastRedundant/4
BenchmarkQueueBroadcastRedundant/4-12               	39209468	        31.2 ns/op
BenchmarkQueueBroadcastRedundant/16
BenchmarkQueueBroadcastRedundant/16-12              	10810058	       105 ns/op
BenchmarkQueueBroadcastRedundant/64
BenchmarkQueueBroadcastRedundant/64-12              	 2998046	       376 ns/op
BenchmarkBroadcasterBroadcastAck
BenchmarkBroadcasterBroadcastAck/1
BenchmarkBroadcasterBroadcastAck/1-12               	44472397	        26.4 ns/op
BenchmarkBroadcasterBroadcastAck/4
BenchmarkBroadcasterBroadcastAck/4-12               	17653509	        69.7 ns/op
BenchmarkBroadcasterBroadcastAck/16
BenchmarkBroadcasterBroadcastAck/16-12              	 4082617	       260 ns/op
BenchmarkBroadcasterBroadcastAck/64
BenchmarkBroadcasterBroadcastAck/64-12              	 1220534	      1027 ns/op
BenchmarkMapBroadcastAck
BenchmarkMapBroadcastAck/1
BenchmarkMapBroadcastAck/1-12                       	26760705	        44.2 ns/op
BenchmarkMapBroadcastAck/4
BenchmarkMapBroadcastAck/4-12                       	11495636	       100 ns/op
BenchmarkMapBroadcastAck/16
BenchmarkMapBroadcastAck/16-12                      	 2937590	       343 ns/op
BenchmarkMapBroadcastAck/64
BenchmarkMapBroadcastAck/64-12                      	  861037	      1344 ns/op
BenchmarkQueueBroadcastAck
BenchmarkQueueBroadcastAck/1
BenchmarkQueueBroadcastAck/1-12                     	19832679	        55.0 ns/op
BenchmarkQueueBroadcastAck/4
BenchmarkQueueBroadcastAck/4-12                     	 5618214	       189 ns/op
BenchmarkQueueBroadcastAck/16
BenchmarkQueueBroadcastAck/16-12                    	 1569980	       713 ns/op
BenchmarkQueueBroadcastAck/64
BenchmarkQueueBroadcastAck/64-12                    	  437672	      2814 ns/op
BenchmarkWaiterNotifyRedundant
BenchmarkWaiterNotifyRedundant-12                   	650823090	         1.96 ns/op
BenchmarkSleeperNotifyRedundant
BenchmarkSleeperNotifyRedundant-12                  	619871544	         1.61 ns/op
BenchmarkChannelNotifyRedundant
BenchmarkChannelNotifyRedundant-12                  	298903778	         3.67 ns/op
BenchmarkWaiterNotifyWaitAck
BenchmarkWaiterNotifyWaitAck-12                     	68358360	        17.8 ns/op
BenchmarkSleeperNotifyWaitAck
BenchmarkSleeperNotifyWaitAck-12                    	25044883	        41.2 ns/op
BenchmarkChannelNotifyWaitAck
BenchmarkChannelNotifyWaitAck-12                    	29572404	        40.2 ns/op
BenchmarkSleeperMultiNotifyWaitAck
BenchmarkSleeperMultiNotifyWaitAck-12               	16122969	        73.8 ns/op
BenchmarkWaiterTempNotifyWaitAck
BenchmarkWaiterTempNotifyWaitAck-12                 	46111489	        25.8 ns/op
BenchmarkSleeperTempNotifyWaitAck
BenchmarkSleeperTempNotifyWaitAck-12                	15541882	        73.6 ns/op
BenchmarkWaiterNotifyWaitMultiAck
BenchmarkWaiterNotifyWaitMultiAck-12                	65878500	        18.2 ns/op
BenchmarkSleeperNotifyWaitMultiAck
BenchmarkSleeperNotifyWaitMultiAck-12               	28798623	        41.5 ns/op
BenchmarkChannelNotifyWaitMultiAck
BenchmarkChannelNotifyWaitMultiAck-12               	11308468	       101 ns/op
BenchmarkWaiterNotifyAsyncWaitAck
BenchmarkWaiterNotifyAsyncWaitAck-12                	 2475387	       492 ns/op
BenchmarkSleeperNotifyAsyncWaitAck
BenchmarkSleeperNotifyAsyncWaitAck-12               	 2184507	       518 ns/op
BenchmarkChannelNotifyAsyncWaitAck
BenchmarkChannelNotifyAsyncWaitAck-12               	 2120365	       562 ns/op
BenchmarkWaiterNotifyAsyncWaitMultiAck
BenchmarkWaiterNotifyAsyncWaitMultiAck-12           	 2351247	       494 ns/op
BenchmarkSleeperNotifyAsyncWaitMultiAck
BenchmarkSleeperNotifyAsyncWaitMultiAck-12          	 2205799	       522 ns/op
BenchmarkChannelNotifyAsyncWaitMultiAck
BenchmarkChannelNotifyAsyncWaitMultiAck-12          	 1238079	       928 ns/op

Updates #1074

PiperOrigin-RevId: 295834087
---
 pkg/syncevent/BUILD                     |  39 +++
 pkg/syncevent/broadcaster.go            | 218 +++++++++++++++++
 pkg/syncevent/broadcaster_test.go       | 376 +++++++++++++++++++++++++++++
 pkg/syncevent/receiver.go               | 103 ++++++++
 pkg/syncevent/source.go                 |  59 +++++
 pkg/syncevent/syncevent.go              |  32 +++
 pkg/syncevent/syncevent_example_test.go | 108 +++++++++
 pkg/syncevent/waiter_amd64.s            |  32 +++
 pkg/syncevent/waiter_arm64.s            |  34 +++
 pkg/syncevent/waiter_asm_unsafe.go      |  24 ++
 pkg/syncevent/waiter_noasm_unsafe.go    |  39 +++
 pkg/syncevent/waiter_test.go            | 414 ++++++++++++++++++++++++++++++++
 pkg/syncevent/waiter_unsafe.go          | 206 ++++++++++++++++
 13 files changed, 1684 insertions(+)
 create mode 100644 pkg/syncevent/BUILD
 create mode 100644 pkg/syncevent/broadcaster.go
 create mode 100644 pkg/syncevent/broadcaster_test.go
 create mode 100644 pkg/syncevent/receiver.go
 create mode 100644 pkg/syncevent/source.go
 create mode 100644 pkg/syncevent/syncevent.go
 create mode 100644 pkg/syncevent/syncevent_example_test.go
 create mode 100644 pkg/syncevent/waiter_amd64.s
 create mode 100644 pkg/syncevent/waiter_arm64.s
 create mode 100644 pkg/syncevent/waiter_asm_unsafe.go
 create mode 100644 pkg/syncevent/waiter_noasm_unsafe.go
 create mode 100644 pkg/syncevent/waiter_test.go
 create mode 100644 pkg/syncevent/waiter_unsafe.go

(limited to 'pkg')

diff --git a/pkg/syncevent/BUILD b/pkg/syncevent/BUILD
new file mode 100644
index 000000000..0500a22cf
--- /dev/null
+++ b/pkg/syncevent/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "syncevent",
+    srcs = [
+        "broadcaster.go",
+        "receiver.go",
+        "source.go",
+        "syncevent.go",
+        "waiter_amd64.s",
+        "waiter_arm64.s",
+        "waiter_asm_unsafe.go",
+        "waiter_noasm_unsafe.go",
+        "waiter_unsafe.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/atomicbitops",
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "syncevent_test",
+    size = "small",
+    srcs = [
+        "broadcaster_test.go",
+        "syncevent_example_test.go",
+        "waiter_test.go",
+    ],
+    library = ":syncevent",
+    deps = [
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/syncevent/broadcaster.go b/pkg/syncevent/broadcaster.go
new file mode 100644
index 000000000..4bff59e7d
--- /dev/null
+++ b/pkg/syncevent/broadcaster.go
@@ -0,0 +1,218 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Broadcaster is an implementation of Source that supports any number of
+// subscribed Receivers.
+//
+// The zero value of Broadcaster is valid and has no subscribed Receivers.
+// Broadcaster is not copyable by value.
+//
+// All Broadcaster methods may be called concurrently from multiple goroutines.
+type Broadcaster struct {
+	// Broadcaster is implemented as a hash table where keys are assigned by
+	// the Broadcaster and returned as SubscriptionIDs, making it safe to use
+	// the identity function for hashing. The hash table resolves collisions
+	// using linear probing and features Robin Hood insertion and backward
+	// shift deletion in order to support a relatively high load factor
+	// efficiently, which matters since the cost of Broadcast is linear in the
+	// size of the table.
+
+	// mu protects the following fields.
+	mu sync.Mutex
+
+	// Invariants: len(table) is 0 or a power of 2.
+	table []broadcasterSlot
+
+	// load is the number of entries in table with receiver != nil.
+	load int
+
+	lastID SubscriptionID
+}
+
+type broadcasterSlot struct {
+	// Invariants: If receiver == nil, then filter == NoEvents and id == 0.
+	// Otherwise, id != 0.
+	receiver *Receiver
+	filter   Set
+	id       SubscriptionID
+}
+
+const (
+	broadcasterMinNonZeroTableSize = 2 // must be a power of 2 > 1
+
+	broadcasterMaxLoadNum = 13
+	broadcasterMaxLoadDen = 16
+)
+
+// SubscribeEvents implements Source.SubscribeEvents.
+func (b *Broadcaster) SubscribeEvents(r *Receiver, filter Set) SubscriptionID {
+	b.mu.Lock()
+
+	// Assign an ID for this subscription.
+	b.lastID++
+	id := b.lastID
+
+	// Expand the table if over the maximum load factor:
+	//
+	//          load / len(b.table) > broadcasterMaxLoadNum / broadcasterMaxLoadDen
+	// load * broadcasterMaxLoadDen > broadcasterMaxLoadNum * len(b.table)
+	b.load++
+	if (b.load * broadcasterMaxLoadDen) > (broadcasterMaxLoadNum * len(b.table)) {
+		// Double the number of slots in the new table.
+		newlen := broadcasterMinNonZeroTableSize
+		if len(b.table) != 0 {
+			newlen = 2 * len(b.table)
+		}
+		if newlen <= cap(b.table) {
+			// Reuse excess capacity in the current table, moving entries not
+			// already in their first-probed positions to better ones.
+			newtable := b.table[:newlen]
+			newmask := uint64(newlen - 1)
+			for i := range b.table {
+				if b.table[i].receiver != nil && uint64(b.table[i].id)&newmask != uint64(i) {
+					entry := b.table[i]
+					b.table[i] = broadcasterSlot{}
+					broadcasterTableInsert(newtable, entry.id, entry.receiver, entry.filter)
+				}
+			}
+			b.table = newtable
+		} else {
+			newtable := make([]broadcasterSlot, newlen)
+			// Copy existing entries to the new table.
+			for i := range b.table {
+				if b.table[i].receiver != nil {
+					broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter)
+				}
+			}
+			// Switch to the new table.
+			b.table = newtable
+		}
+	}
+
+	broadcasterTableInsert(b.table, id, r, filter)
+	b.mu.Unlock()
+	return id
+}
+
+// Preconditions: table must not be full. len(table) is a power of 2.
+func broadcasterTableInsert(table []broadcasterSlot, id SubscriptionID, r *Receiver, filter Set) {
+	entry := broadcasterSlot{
+		receiver: r,
+		filter:   filter,
+		id:       id,
+	}
+	mask := uint64(len(table) - 1)
+	i := uint64(id) & mask
+	disp := uint64(0)
+	for {
+		if table[i].receiver == nil {
+			table[i] = entry
+			return
+		}
+		// If we've been displaced farther from our first-probed slot than the
+		// element stored in this one, swap elements and switch to inserting
+		// the replaced one. (This is Robin Hood insertion.)
+		slotDisp := (i - uint64(table[i].id)) & mask
+		if disp > slotDisp {
+			table[i], entry = entry, table[i]
+			disp = slotDisp
+		}
+		i = (i + 1) & mask
+		disp++
+	}
+}
+
+// UnsubscribeEvents implements Source.UnsubscribeEvents.
+func (b *Broadcaster) UnsubscribeEvents(id SubscriptionID) {
+	b.mu.Lock()
+
+	mask := uint64(len(b.table) - 1)
+	i := uint64(id) & mask
+	for {
+		if b.table[i].id == id {
+			// Found the element to remove. Move all subsequent elements
+			// backward until we either find an empty slot, or an element that
+			// is already in its first-probed slot. (This is backward shift
+			// deletion.)
+			for {
+				next := (i + 1) & mask
+				if b.table[next].receiver == nil {
+					break
+				}
+				if uint64(b.table[next].id)&mask == next {
+					break
+				}
+				b.table[i] = b.table[next]
+				i = next
+			}
+			b.table[i] = broadcasterSlot{}
+			break
+		}
+		i = (i + 1) & mask
+	}
+
+	// If a table 1/4 of the current size would still be at or under the
+	// maximum load factor (i.e. the current table size is at least two
+	// expansions bigger than necessary), halve the size of the table to reduce
+	// the cost of Broadcast. Since we are concerned with iteration time and
+	// not memory usage, reuse the existing slice to reduce future allocations
+	// from table re-expansion.
+	b.load--
+	if len(b.table) > broadcasterMinNonZeroTableSize && (b.load*(4*broadcasterMaxLoadDen)) <= (broadcasterMaxLoadNum*len(b.table)) {
+		newlen := len(b.table) / 2
+		newtable := b.table[:newlen]
+		for i := newlen; i < len(b.table); i++ {
+			if b.table[i].receiver != nil {
+				broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter)
+				b.table[i] = broadcasterSlot{}
+			}
+		}
+		b.table = newtable
+	}
+
+	b.mu.Unlock()
+}
+
+// Broadcast notifies all Receivers subscribed to the Broadcaster of the subset
+// of events to which they subscribed. The order in which Receivers are
+// notified is unspecified.
+func (b *Broadcaster) Broadcast(events Set) {
+	b.mu.Lock()
+	for i := range b.table {
+		if intersection := events & b.table[i].filter; intersection != 0 {
+			// We don't need to check if broadcasterSlot.receiver is nil, since
+			// if it is then broadcasterSlot.filter is 0.
+			b.table[i].receiver.Notify(intersection)
+		}
+	}
+	b.mu.Unlock()
+}
+
+// FilteredEvents returns the set of events for which Broadcast will notify at
+// least one Receiver, i.e. the union of filters for all subscribed Receivers.
+func (b *Broadcaster) FilteredEvents() Set {
+	var es Set
+	b.mu.Lock()
+	for i := range b.table {
+		es |= b.table[i].filter
+	}
+	b.mu.Unlock()
+	return es
+}
diff --git a/pkg/syncevent/broadcaster_test.go b/pkg/syncevent/broadcaster_test.go
new file mode 100644
index 000000000..e88779e23
--- /dev/null
+++ b/pkg/syncevent/broadcaster_test.go
@@ -0,0 +1,376 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"fmt"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func TestBroadcasterFilter(t *testing.T) {
+	const numReceivers = 2 * MaxEvents
+
+	var br Broadcaster
+	ws := make([]Waiter, numReceivers)
+	for i := range ws {
+		ws[i].Init()
+		br.SubscribeEvents(ws[i].Receiver(), 1<<(i%MaxEvents))
+	}
+	for ev := 0; ev < MaxEvents; ev++ {
+		br.Broadcast(1 << ev)
+		for i := range ws {
+			want := NoEvents
+			if i%MaxEvents == ev {
+				want = 1 << ev
+			}
+			if got := ws[i].Receiver().PendingAndAckAll(); got != want {
+				t.Errorf("after Broadcast of event %d: waiter %d has pending event set %#x, wanted %#x", ev, i, got, want)
+			}
+		}
+	}
+}
+
+// TestBroadcasterManySubscriptions tests that subscriptions are not lost by
+// table expansion/compaction.
+func TestBroadcasterManySubscriptions(t *testing.T) {
+	const numReceivers = 5000 // arbitrary
+
+	var br Broadcaster
+	ws := make([]Waiter, numReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+
+	ids := make([]SubscriptionID, numReceivers)
+	for i := 0; i < numReceivers; i++ {
+		// Subscribe receiver i.
+		ids[i] = br.SubscribeEvents(ws[i].Receiver(), 1)
+		// Check that receivers [0, i] are subscribed.
+		br.Broadcast(1)
+		for j := 0; j <= i; j++ {
+			if ws[j].Pending() != 1 {
+				t.Errorf("receiver %d did not receive an event after subscription of receiver %d", j, i)
+			}
+			ws[j].Ack(1)
+		}
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numReceivers)
+	for i := 0; i < numReceivers; i++ {
+		// Unsubscribe receiver unsub[i].
+		br.UnsubscribeEvents(ids[unsub[i]])
+		// Check that receivers [unsub[0], unsub[i]] are not subscribed, and that
+		// receivers (unsub[i], unsub[numReceivers]) are still subscribed.
+		br.Broadcast(1)
+		for j := 0; j <= i; j++ {
+			if ws[unsub[j]].Pending() != 0 {
+				t.Errorf("unsub iteration %d: receiver %d received an event after unsubscription of receiver %d", i, unsub[j], unsub[i])
+			}
+		}
+		for j := i + 1; j < numReceivers; j++ {
+			if ws[unsub[j]].Pending() != 1 {
+				t.Errorf("unsub iteration %d: receiver %d did not receive an event after unsubscription of receiver %d", i, unsub[j], unsub[i])
+			}
+			ws[unsub[j]].Ack(1)
+		}
+	}
+}
+
+var (
+	receiverCountsNonZero       = []int{1, 4, 16, 64}
+	receiverCountsIncludingZero = append([]int{0}, receiverCountsNonZero...)
+)
+
+// BenchmarkBroadcasterX, BenchmarkMapX, and BenchmarkQueueX benchmark usage
+// pattern X (described in terms of Broadcaster) with Broadcaster, a
+// Mutex-protected map[*Receiver]Set, and waiter.Queue respectively.
+
+// BenchmarkXxxSubscribeUnsubscribe measures the cost of a Subscribe/Unsubscribe
+// cycle.
+
+func BenchmarkBroadcasterSubscribeUnsubscribe(b *testing.B) {
+	var br Broadcaster
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		id := br.SubscribeEvents(w.Receiver(), 1)
+		br.UnsubscribeEvents(id)
+	}
+}
+
+func BenchmarkMapSubscribeUnsubscribe(b *testing.B) {
+	var mu sync.Mutex
+	m := make(map[*Receiver]Set)
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		mu.Lock()
+		m[w.Receiver()] = Set(1)
+		mu.Unlock()
+		mu.Lock()
+		delete(m, w.Receiver())
+		mu.Unlock()
+	}
+}
+
+func BenchmarkQueueSubscribeUnsubscribe(b *testing.B) {
+	var q waiter.Queue
+	e, _ := waiter.NewChannelEntry(nil)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		q.EventRegister(&e, 1)
+		q.EventUnregister(&e)
+	}
+}
+
+// BenchmarkXxxSubscribeUnsubscribeBatch is similar to
+// BenchmarkXxxSubscribeUnsubscribe, but subscribes and unsubscribes a large
+// number of Receivers at a time in order to measure the amortized overhead of
+// table expansion/compaction. (Since waiter.Queue is implemented using a
+// linked list, BenchmarkQueueSubscribeUnsubscribe and
+// BenchmarkQueueSubscribeUnsubscribeBatch should produce nearly the same
+// result.)
+
+const numBatchReceivers = 1000
+
+func BenchmarkBroadcasterSubscribeUnsubscribeBatch(b *testing.B) {
+	var br Broadcaster
+	ws := make([]Waiter, numBatchReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+	ids := make([]SubscriptionID, numBatchReceivers)
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			ids[j] = br.SubscribeEvents(ws[j].Receiver(), 1)
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			br.UnsubscribeEvents(ids[unsub[j]])
+		}
+	}
+}
+
+func BenchmarkMapSubscribeUnsubscribeBatch(b *testing.B) {
+	var mu sync.Mutex
+	m := make(map[*Receiver]Set)
+	ws := make([]Waiter, numBatchReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			mu.Lock()
+			m[ws[j].Receiver()] = Set(1)
+			mu.Unlock()
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			mu.Lock()
+			delete(m, ws[unsub[j]].Receiver())
+			mu.Unlock()
+		}
+	}
+}
+
+func BenchmarkQueueSubscribeUnsubscribeBatch(b *testing.B) {
+	var q waiter.Queue
+	es := make([]waiter.Entry, numBatchReceivers)
+	for i := range es {
+		es[i], _ = waiter.NewChannelEntry(nil)
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			q.EventRegister(&es[j], 1)
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			q.EventUnregister(&es[unsub[j]])
+		}
+	}
+}
+
+// BenchmarkXxxBroadcastRedundant measures how long it takes to Broadcast
+// already-pending events to multiple Receivers.
+
+func BenchmarkBroadcasterBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var br Broadcaster
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				br.SubscribeEvents(ws[i].Receiver(), 1)
+			}
+			br.Broadcast(1)
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				br.Broadcast(1)
+			}
+		})
+	}
+}
+
+func BenchmarkMapBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var mu sync.Mutex
+			m := make(map[*Receiver]Set)
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				m[ws[i].Receiver()] = Set(1)
+			}
+			mu.Lock()
+			for r := range m {
+				r.Notify(1)
+			}
+			mu.Unlock()
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				mu.Lock()
+				for r := range m {
+					r.Notify(1)
+				}
+				mu.Unlock()
+			}
+		})
+	}
+}
+
+func BenchmarkQueueBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var q waiter.Queue
+			for i := 0; i < n; i++ {
+				e, _ := waiter.NewChannelEntry(nil)
+				q.EventRegister(&e, 1)
+			}
+			q.Notify(1)
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				q.Notify(1)
+			}
+		})
+	}
+}
+
+// BenchmarkXxxBroadcastAck measures how long it takes to Broadcast events to
+// multiple Receivers, check that all Receivers have received the event, and
+// clear the event from all Receivers.
+
+func BenchmarkBroadcasterBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var br Broadcaster
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				br.SubscribeEvents(ws[i].Receiver(), 1)
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				br.Broadcast(1)
+				for j := range ws {
+					if got, want := ws[j].Pending(), Set(1); got != want {
+						b.Fatalf("Receiver.Pending(): got %#x, wanted %#x", got, want)
+					}
+					ws[j].Ack(1)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkMapBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var mu sync.Mutex
+			m := make(map[*Receiver]Set)
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				m[ws[i].Receiver()] = Set(1)
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				mu.Lock()
+				for r := range m {
+					r.Notify(1)
+				}
+				mu.Unlock()
+				for j := range ws {
+					if got, want := ws[j].Pending(), Set(1); got != want {
+						b.Fatalf("Receiver.Pending(): got %#x, wanted %#x", got, want)
+					}
+					ws[j].Ack(1)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkQueueBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var q waiter.Queue
+			chs := make([]chan struct{}, n)
+			for i := range chs {
+				e, ch := waiter.NewChannelEntry(nil)
+				q.EventRegister(&e, 1)
+				chs[i] = ch
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				q.Notify(1)
+				for _, ch := range chs {
+					select {
+					case <-ch:
+					default:
+						b.Fatalf("channel did not receive event")
+					}
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/syncevent/receiver.go b/pkg/syncevent/receiver.go
new file mode 100644
index 000000000..5c86e5400
--- /dev/null
+++ b/pkg/syncevent/receiver.go
@@ -0,0 +1,103 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/atomicbitops"
+)
+
+// Receiver is an event sink that holds pending events and invokes a callback
+// whenever new events become pending. Receiver's methods may be called
+// concurrently from multiple goroutines.
+//
+// Receiver.Init() must be called before first use.
+type Receiver struct {
+	// pending is the set of pending events. pending is accessed using atomic
+	// memory operations.
+	pending uint64
+
+	// cb is notified when new events become pending. cb is immutable after
+	// Init().
+	cb ReceiverCallback
+}
+
+// ReceiverCallback receives callbacks from a Receiver.
+type ReceiverCallback interface {
+	// NotifyPending is called when the corresponding Receiver has new pending
+	// events.
+	//
+	// NotifyPending is called synchronously from Receiver.Notify(), so
+	// implementations must not take locks that may be held by callers of
+	// Receiver.Notify(). NotifyPending may be called concurrently from
+	// multiple goroutines.
+	NotifyPending()
+}
+
+// Init must be called before first use of r.
+func (r *Receiver) Init(cb ReceiverCallback) {
+	r.cb = cb
+}
+
+// Pending returns the set of pending events.
+func (r *Receiver) Pending() Set {
+	return Set(atomic.LoadUint64(&r.pending))
+}
+
+// Notify sets the given events as pending.
+func (r *Receiver) Notify(es Set) {
+	p := Set(atomic.LoadUint64(&r.pending))
+	// Optimization: Skip the atomic CAS on r.pending if all events are
+	// already pending.
+	if p&es == es {
+		return
+	}
+	// When this is uncontended (the common case), CAS is faster than
+	// atomic-OR because the former is inlined and the latter (which we
+	// implement in assembly ourselves) is not.
+	if !atomic.CompareAndSwapUint64(&r.pending, uint64(p), uint64(p|es)) {
+		// If the CAS fails, fall back to atomic-OR.
+		atomicbitops.OrUint64(&r.pending, uint64(es))
+	}
+	r.cb.NotifyPending()
+}
+
+// Ack unsets the given events as pending.
+func (r *Receiver) Ack(es Set) {
+	p := Set(atomic.LoadUint64(&r.pending))
+	// Optimization: Skip the atomic CAS on r.pending if all events are
+	// already not pending.
+	if p&es == 0 {
+		return
+	}
+	// When this is uncontended (the common case), CAS is faster than
+	// atomic-AND because the former is inlined and the latter (which we
+	// implement in assembly ourselves) is not.
+	if !atomic.CompareAndSwapUint64(&r.pending, uint64(p), uint64(p&^es)) {
+		// If the CAS fails, fall back to atomic-AND.
+		atomicbitops.AndUint64(&r.pending, ^uint64(es))
+	}
+}
+
+// PendingAndAckAll unsets all events as pending and returns the set of
+// previously-pending events.
+//
+// PendingAndAckAll should only be used in preference to a call to Pending
+// followed by a conditional call to Ack when the caller expects events to be
+// pending (e.g. after a call to ReceiverCallback.NotifyPending()).
+func (r *Receiver) PendingAndAckAll() Set {
+	return Set(atomic.SwapUint64(&r.pending, 0))
+}
diff --git a/pkg/syncevent/source.go b/pkg/syncevent/source.go
new file mode 100644
index 000000000..ddffb171a
--- /dev/null
+++ b/pkg/syncevent/source.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+// Source represents an event source.
+type Source interface {
+	// SubscribeEvents causes the Source to notify the given Receiver of the
+	// given subset of events.
+	//
+	// Preconditions: r != nil. The ReceiverCallback for r must not take locks
+	// that are ordered prior to the Source; for example, it cannot call any
+	// Source methods.
+	SubscribeEvents(r *Receiver, filter Set) SubscriptionID
+
+	// UnsubscribeEvents causes the Source to stop notifying the Receiver
+	// subscribed by a previous call to SubscribeEvents that returned the given
+	// SubscriptionID.
+	//
+	// Preconditions: UnsubscribeEvents may be called at most once for any
+	// given SubscriptionID.
+	UnsubscribeEvents(id SubscriptionID)
+}
+
+// SubscriptionID identifies a call to Source.SubscribeEvents.
+type SubscriptionID uint64
+
+// UnsubscribeAndAck is a convenience function that unsubscribes r from the
+// given events from src and also clears them from r.
+func UnsubscribeAndAck(src Source, r *Receiver, filter Set, id SubscriptionID) {
+	src.UnsubscribeEvents(id)
+	r.Ack(filter)
+}
+
+// NoopSource implements Source by never sending events to subscribed
+// Receivers.
+type NoopSource struct{}
+
+// SubscribeEvents implements Source.SubscribeEvents.
+func (NoopSource) SubscribeEvents(*Receiver, Set) SubscriptionID {
+	return 0
+}
+
+// UnsubscribeEvents implements Source.UnsubscribeEvents.
+func (NoopSource) UnsubscribeEvents(SubscriptionID) {
+}
+
+// See Broadcaster for a non-noop implementations of Source.
diff --git a/pkg/syncevent/syncevent.go b/pkg/syncevent/syncevent.go
new file mode 100644
index 000000000..9fb6a06de
--- /dev/null
+++ b/pkg/syncevent/syncevent.go
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package syncevent provides efficient primitives for goroutine
+// synchronization based on event bitmasks.
+package syncevent
+
+// Set is a bitmask where each bit represents a distinct user-defined event.
+// The event package does not treat any bits in Set specially.
+type Set uint64
+
+const (
+	// NoEvents is a Set containing no events.
+	NoEvents = Set(0)
+
+	// AllEvents is a Set containing all possible events.
+	AllEvents = ^Set(0)
+
+	// MaxEvents is the number of distinct events that can be represented by a Set.
+	MaxEvents = 64
+)
diff --git a/pkg/syncevent/syncevent_example_test.go b/pkg/syncevent/syncevent_example_test.go
new file mode 100644
index 000000000..bfb18e2ea
--- /dev/null
+++ b/pkg/syncevent/syncevent_example_test.go
@@ -0,0 +1,108 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"fmt"
+	"sync/atomic"
+	"time"
+)
+
+func Example_ioReadinessInterrputible() {
+	const (
+		evReady = Set(1 << iota)
+		evInterrupt
+	)
+	errNotReady := fmt.Errorf("not ready for I/O")
+
+	// State of some I/O object.
+	var (
+		br    Broadcaster
+		ready uint32
+	)
+	doIO := func() error {
+		if atomic.LoadUint32(&ready) == 0 {
+			return errNotReady
+		}
+		return nil
+	}
+	go func() {
+		// The I/O object eventually becomes ready for I/O.
+		time.Sleep(100 * time.Millisecond)
+		// When it does, it first ensures that future calls to isReady() return
+		// true, then broadcasts the readiness event to Receivers.
+		atomic.StoreUint32(&ready, 1)
+		br.Broadcast(evReady)
+	}()
+
+	// Each user of the I/O object owns a Waiter.
+	var w Waiter
+	w.Init()
+	// The Waiter may be asynchronously interruptible, e.g. for signal
+	// handling in the sentry.
+	go func() {
+		time.Sleep(200 * time.Millisecond)
+		w.Receiver().Notify(evInterrupt)
+	}()
+
+	// To use the I/O object:
+	//
+	// Optionally, if the I/O object is likely to be ready, attempt I/O first.
+	err := doIO()
+	if err == nil {
+		// Success, we're done.
+		return /* nil */
+	}
+	if err != errNotReady {
+		// Failure, I/O failed for some reason other than readiness.
+		return /* err */
+	}
+	// Subscribe for readiness events from the I/O object.
+	id := br.SubscribeEvents(w.Receiver(), evReady)
+	// When we are finished blocking, unsubscribe from readiness events and
+	// remove readiness events from the pending event set.
+	defer UnsubscribeAndAck(&br, w.Receiver(), evReady, id)
+	for {
+		// Attempt I/O again. This must be done after the call to SubscribeEvents,
+		// since the I/O object might have become ready between the previous call
+		// to doIO and the call to SubscribeEvents.
+		err = doIO()
+		if err == nil {
+			return /* nil */
+		}
+		if err != errNotReady {
+			return /* err */
+		}
+		// Block until either the I/O object indicates it is ready, or we are
+		// interrupted.
+		events := w.Wait()
+		if events&evInterrupt != 0 {
+			// In the specific case of sentry signal handling, signal delivery
+			// is handled by another system, so we aren't responsible for
+			// acknowledging evInterrupt.
+			return /* errInterrupted */
+		}
+		// Note that, in a concurrent context, the I/O object might become
+		// ready and then not ready again. To handle this:
+		//
+		// - evReady must be acknowledged before calling doIO() again (rather
+		// than after), so that if the I/O object becomes ready *again* after
+		// the call to doIO(), the readiness event is not lost.
+		//
+		// - We must loop instead of just calling doIO() once after receiving
+		// evReady.
+		w.Ack(evReady)
+	}
+}
diff --git a/pkg/syncevent/waiter_amd64.s b/pkg/syncevent/waiter_amd64.s
new file mode 100644
index 000000000..985b56ae5
--- /dev/null
+++ b/pkg/syncevent/waiter_amd64.s
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+//
+// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
+TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
+	MOVQ g+0(FP), DI
+	MOVQ wg+8(FP), SI
+
+	MOVQ $·preparingG(SB), AX
+	LOCK
+	CMPXCHGQ DI, 0(SI)
+
+	SETEQ AX
+	MOVB AX, ret+16(FP)
+
+	RET
+
diff --git a/pkg/syncevent/waiter_arm64.s b/pkg/syncevent/waiter_arm64.s
new file mode 100644
index 000000000..20d7ac23b
--- /dev/null
+++ b/pkg/syncevent/waiter_arm64.s
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+//
+// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
+TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
+	MOVD wg+8(FP), R0
+	MOVD $·preparingG(SB), R1
+	MOVD g+0(FP), R2
+again:
+	LDAXR (R0), R3
+	CMP R1, R3
+	BNE ok
+	STLXR R2, (R0), R3
+	CBNZ R3, again
+ok:
+	CSET EQ, R0
+	MOVB R0, ret+16(FP)
+	RET
+
diff --git a/pkg/syncevent/waiter_asm_unsafe.go b/pkg/syncevent/waiter_asm_unsafe.go
new file mode 100644
index 000000000..0995e9053
--- /dev/null
+++ b/pkg/syncevent/waiter_asm_unsafe.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 arm64
+
+package syncevent
+
+import (
+	"unsafe"
+)
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
diff --git a/pkg/syncevent/waiter_noasm_unsafe.go b/pkg/syncevent/waiter_noasm_unsafe.go
new file mode 100644
index 000000000..1c4b0e39a
--- /dev/null
+++ b/pkg/syncevent/waiter_noasm_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// waiterUnlock is called from g0, so when the race detector is enabled,
+// waiterUnlock must be implemented in assembly since no race context is
+// available.
+//
+// +build !race
+// +build !amd64,!arm64
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// waiterUnlock is the "unlock function" passed to runtime.gopark by
+// Waiter.Wait*. wg is &Waiter.g, and g is a pointer to the calling runtime.g.
+// waiterUnlock returns true if Waiter.Wait should sleep and false if sleeping
+// should be aborted.
+//
+//go:nosplit
+func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool {
+	// The only way this CAS can fail is if a call to Waiter.NotifyPending()
+	// has replaced *wg with nil, in which case we should not sleep.
+	return atomic.CompareAndSwapPointer(wg, (unsafe.Pointer)(&preparingG), g)
+}
diff --git a/pkg/syncevent/waiter_test.go b/pkg/syncevent/waiter_test.go
new file mode 100644
index 000000000..3c8cbcdd8
--- /dev/null
+++ b/pkg/syncevent/waiter_test.go
@@ -0,0 +1,414 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+func TestWaiterAlreadyPending(t *testing.T) {
+	var w Waiter
+	w.Init()
+	want := Set(1)
+	w.Notify(want)
+	if got := w.Wait(); got != want {
+		t.Errorf("Waiter.Wait: got %#x, wanted %#x", got, want)
+	}
+}
+
+func TestWaiterAsyncNotify(t *testing.T) {
+	var w Waiter
+	w.Init()
+	want := Set(1)
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		w.Notify(want)
+	}()
+	if got := w.Wait(); got != want {
+		t.Errorf("Waiter.Wait: got %#x, wanted %#x", got, want)
+	}
+}
+
+func TestWaiterWaitFor(t *testing.T) {
+	var w Waiter
+	w.Init()
+	evWaited := Set(1)
+	evOther := Set(2)
+	w.Notify(evOther)
+	notifiedEvent := uint32(0)
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		atomic.StoreUint32(&notifiedEvent, 1)
+		w.Notify(evWaited)
+	}()
+	if got, want := w.WaitFor(evWaited), evWaited|evOther; got != want {
+		t.Errorf("Waiter.WaitFor: got %#x, wanted %#x", got, want)
+	}
+	if atomic.LoadUint32(&notifiedEvent) == 0 {
+		t.Errorf("Waiter.WaitFor returned before goroutine notified waited-for event")
+	}
+}
+
+func TestWaiterWaitAndAckAll(t *testing.T) {
+	var w Waiter
+	w.Init()
+	w.Notify(AllEvents)
+	if got := w.WaitAndAckAll(); got != AllEvents {
+		t.Errorf("Waiter.WaitAndAckAll: got %#x, wanted %#x", got, AllEvents)
+	}
+	if got := w.Pending(); got != NoEvents {
+		t.Errorf("Waiter.WaitAndAckAll did not ack all events: got %#x, wanted 0", got)
+	}
+}
+
+// BenchmarkWaiterX, BenchmarkSleeperX, and BenchmarkChannelX benchmark usage
+// pattern X (described in terms of Waiter) with Waiter, sleep.Sleeper, and
+// buffered chan struct{} respectively. When the maximum number of event
+// sources is relevant, we use 3 event sources because this is representative
+// of the kernel.Task.block() use case: an interrupt source, a timeout source,
+// and the actual event source being waited on.
+
+// Event set used by most benchmarks.
+const evBench Set = 1
+
+// BenchmarkXxxNotifyRedundant measures how long it takes to notify a Waiter of
+// an event that is already pending.
+
+func BenchmarkWaiterNotifyRedundant(b *testing.B) {
+	var w Waiter
+	w.Init()
+	w.Notify(evBench)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyRedundant(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+	w.Assert()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Assert()
+	}
+}
+
+func BenchmarkChannelNotifyRedundant(b *testing.B) {
+	ch := make(chan struct{}, 1)
+	ch <- struct{}{}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
+	}
+}
+
+// BenchmarkXxxNotifyWaitAck measures how long it takes to notify a Waiter an
+// event, return that event using a blocking check, and then unset the event as
+// pending.
+
+func BenchmarkWaiterNotifyWaitAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+		w.Wait()
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Assert()
+		s.Fetch(true)
+	}
+}
+
+func BenchmarkChannelNotifyWaitAck(b *testing.B) {
+	ch := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// notify
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
+
+		// wait + ack
+		<-ch
+	}
+}
+
+// BenchmarkSleeperMultiNotifyWaitAck is equivalent to
+// BenchmarkSleeperNotifyWaitAck, but also includes allocation of a
+// temporary sleep.Waker. This is necessary when multiple goroutines may wait
+// for the same event, since each sleep.Waker can wake only a single
+// sleep.Sleeper.
+//
+// The syncevent package does not require a distinct object for each
+// waiter-waker relationship, so BenchmarkWaiterNotifyWaitAck and
+// BenchmarkWaiterMultiNotifyWaitAck would be identical. The analogous state
+// for channels, runtime.sudog, is inescapably runtime-allocated, so
+// BenchmarkChannelNotifyWaitAck and BenchmarkChannelMultiNotifyWaitAck would
+// also be identical.
+
+func BenchmarkSleeperMultiNotifyWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	// The sleep package doesn't provide sync.Pool allocation of Wakers;
+	// we do for a fairer comparison.
+	wakerPool := sync.Pool{
+		New: func() interface{} {
+			return &sleep.Waker{}
+		},
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w := wakerPool.Get().(*sleep.Waker)
+		s.AddWaker(w, 0)
+		w.Assert()
+		s.Fetch(true)
+		s.Done()
+		wakerPool.Put(w)
+	}
+}
+
+// BenchmarkXxxTempNotifyWaitAck is equivalent to NotifyWaitAck, but also
+// includes allocation of a temporary Waiter. This models the case where a
+// goroutine not already associated with a Waiter needs one in order to block.
+//
+// The analogous state for channels is built into runtime.g, so
+// BenchmarkChannelNotifyWaitAck and BenchmarkChannelTempNotifyWaitAck would be
+// identical.
+
+func BenchmarkWaiterTempNotifyWaitAck(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w := GetWaiter()
+		w.Notify(evBench)
+		w.Wait()
+		w.Ack(evBench)
+		PutWaiter(w)
+	}
+}
+
+func BenchmarkSleeperTempNotifyWaitAck(b *testing.B) {
+	// The sleep package doesn't provide sync.Pool allocation of Sleepers;
+	// we do for a fairer comparison.
+	sleeperPool := sync.Pool{
+		New: func() interface{} {
+			return &sleep.Sleeper{}
+		},
+	}
+	var w sleep.Waker
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s := sleeperPool.Get().(*sleep.Sleeper)
+		s.AddWaker(&w, 0)
+		w.Assert()
+		s.Fetch(true)
+		s.Done()
+		sleeperPool.Put(s)
+	}
+}
+
+// BenchmarkXxxNotifyWaitMultiAck is equivalent to NotifyWaitAck, but allows
+// for multiple event sources.
+
+func BenchmarkWaiterNotifyWaitMultiAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+		if e := w.Wait(); e != evBench {
+			b.Fatalf("Wait: got %#x, wanted %#x", e, evBench)
+		}
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyWaitMultiAck(b *testing.B) {
+	var s sleep.Sleeper
+	var ws [3]sleep.Waker
+	for i := range ws {
+		s.AddWaker(&ws[i], i)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ws[0].Assert()
+		if id, _ := s.Fetch(true); id != 0 {
+			b.Fatalf("Fetch: got %d, wanted 0", id)
+		}
+	}
+}
+
+func BenchmarkChannelNotifyWaitMultiAck(b *testing.B) {
+	ch0 := make(chan struct{}, 1)
+	ch1 := make(chan struct{}, 1)
+	ch2 := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// notify
+		select {
+		case ch0 <- struct{}{}:
+		default:
+		}
+
+		// wait + clear
+		select {
+		case <-ch0:
+			// ok
+		case <-ch1:
+			b.Fatalf("received from ch1")
+		case <-ch2:
+			b.Fatalf("received from ch2")
+		}
+	}
+}
+
+// BenchmarkXxxNotifyAsyncWaitAck measures how long it takes to wait for an
+// event while another goroutine signals the event. This assumes that a new
+// goroutine doesn't run immediately (i.e. the creator of a new goroutine is
+// allowed to go to sleep before the new goroutine has a chance to run).
+
+func BenchmarkWaiterNotifyAsyncWaitAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Notify(1)
+		}()
+		w.Wait()
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyAsyncWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Assert()
+		}()
+		s.Fetch(true)
+	}
+}
+
+func BenchmarkChannelNotifyAsyncWaitAck(b *testing.B) {
+	ch := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			select {
+			case ch <- struct{}{}:
+			default:
+			}
+		}()
+		<-ch
+	}
+}
+
+// BenchmarkXxxNotifyAsyncWaitMultiAck is equivalent to NotifyAsyncWaitAck, but
+// allows for multiple event sources.
+
+func BenchmarkWaiterNotifyAsyncWaitMultiAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Notify(evBench)
+		}()
+		if e := w.Wait(); e != evBench {
+			b.Fatalf("Wait: got %#x, wanted %#x", e, evBench)
+		}
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyAsyncWaitMultiAck(b *testing.B) {
+	var s sleep.Sleeper
+	var ws [3]sleep.Waker
+	for i := range ws {
+		s.AddWaker(&ws[i], i)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			ws[0].Assert()
+		}()
+		if id, _ := s.Fetch(true); id != 0 {
+			b.Fatalf("Fetch: got %d, expected 0", id)
+		}
+	}
+}
+
+func BenchmarkChannelNotifyAsyncWaitMultiAck(b *testing.B) {
+	ch0 := make(chan struct{}, 1)
+	ch1 := make(chan struct{}, 1)
+	ch2 := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			select {
+			case ch0 <- struct{}{}:
+			default:
+			}
+		}()
+
+		select {
+		case <-ch0:
+			// ok
+		case <-ch1:
+			b.Fatalf("received from ch1")
+		case <-ch2:
+			b.Fatalf("received from ch2")
+		}
+	}
+}
diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go
new file mode 100644
index 000000000..112e0e604
--- /dev/null
+++ b/pkg/syncevent/waiter_unsafe.go
@@ -0,0 +1,206 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.11
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+//go:linkname gopark runtime.gopark
+func gopark(unlockf func(unsafe.Pointer, *unsafe.Pointer) bool, wg *unsafe.Pointer, reason uint8, traceEv byte, traceskip int)
+
+//go:linkname goready runtime.goready
+func goready(g unsafe.Pointer, traceskip int)
+
+const (
+	waitReasonSelect     = 9  // Go: src/runtime/runtime2.go
+	traceEvGoBlockSelect = 24 // Go: src/runtime/trace.go
+)
+
+// Waiter allows a goroutine to block on pending events received by a Receiver.
+//
+// Waiter.Init() must be called before first use.
+type Waiter struct {
+	r Receiver
+
+	// g is one of:
+	//
+	// - nil: No goroutine is blocking in Wait.
+	//
+	// - &preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet
+	// completed waiterUnlock(). Thus the wait can only be interrupted by
+	// replacing the value of g with nil (the G may not be in state Gwaiting
+	// yet, so we can't call goready.)
+	//
+	// - Otherwise: g is a pointer to the runtime.g in state Gwaiting for the
+	// goroutine blocked in Wait, which can only be woken by calling goready.
+	g unsafe.Pointer `state:"zerovalue"`
+}
+
+// Sentinel object for Waiter.g.
+var preparingG struct{}
+
+// Init must be called before first use of w.
+func (w *Waiter) Init() {
+	w.r.Init(w)
+}
+
+// Receiver returns the Receiver that receives events that unblock calls to
+// w.Wait().
+func (w *Waiter) Receiver() *Receiver {
+	return &w.r
+}
+
+// Pending returns the set of pending events.
+func (w *Waiter) Pending() Set {
+	return w.r.Pending()
+}
+
+// Wait blocks until at least one event is pending, then returns the set of
+// pending events. It does not affect the set of pending events; callers must
+// call w.Ack() to do so, or use w.WaitAndAck() instead.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) Wait() Set {
+	return w.WaitFor(AllEvents)
+}
+
+// WaitFor blocks until at least one event in es is pending, then returns the
+// set of pending events (including those not in es). It does not affect the
+// set of pending events; callers must call w.Ack() to do so.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) WaitFor(es Set) Set {
+	for {
+		// Optimization: Skip the atomic store to w.g if an event is already
+		// pending.
+		if p := w.r.Pending(); p&es != NoEvents {
+			return p
+		}
+
+		// Indicate that we're preparing to go to sleep.
+		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+
+		// If an event is pending, abort the sleep.
+		if p := w.r.Pending(); p&es != NoEvents {
+			atomic.StorePointer(&w.g, nil)
+			return p
+		}
+
+		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
+		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// w.NotifyPending() => goready().
+		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+	}
+}
+
+// Ack marks the given events as not pending.
+func (w *Waiter) Ack(es Set) {
+	w.r.Ack(es)
+}
+
+// WaitAndAckAll blocks until at least one event is pending, then marks all
+// events as not pending and returns the set of previously-pending events.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) WaitAndAckAll() Set {
+	// Optimization: Skip the atomic store to w.g if an event is already
+	// pending. Call Pending() first since, in the common case that events are
+	// not yet pending, this skips an atomic swap on w.r.pending.
+	if w.r.Pending() != NoEvents {
+		if p := w.r.PendingAndAckAll(); p != NoEvents {
+			return p
+		}
+	}
+
+	for {
+		// Indicate that we're preparing to go to sleep.
+		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+
+		// If an event is pending, abort the sleep.
+		if w.r.Pending() != NoEvents {
+			if p := w.r.PendingAndAckAll(); p != NoEvents {
+				atomic.StorePointer(&w.g, nil)
+				return p
+			}
+		}
+
+		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
+		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// w.NotifyPending() => goready().
+		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+
+		// Check for pending events. We call PendingAndAckAll() directly now since
+		// we only expect to be woken after events become pending.
+		if p := w.r.PendingAndAckAll(); p != NoEvents {
+			return p
+		}
+	}
+}
+
+// Notify marks the given events as pending, possibly unblocking concurrent
+// calls to w.Wait() or w.WaitFor().
+func (w *Waiter) Notify(es Set) {
+	w.r.Notify(es)
+}
+
+// NotifyPending implements ReceiverCallback.NotifyPending. Users of Waiter
+// should not call NotifyPending.
+func (w *Waiter) NotifyPending() {
+	// Optimization: Skip the atomic swap on w.g if there is no sleeping
+	// goroutine. NotifyPending is called after w.r.Pending() is updated, so
+	// concurrent and future calls to w.Wait() will observe pending events and
+	// abort sleeping.
+	if atomic.LoadPointer(&w.g) == nil {
+		return
+	}
+	// Wake a sleeping G, or prevent a G that is preparing to sleep from doing
+	// so. Swap is needed here to ensure that only one call to NotifyPending
+	// calls goready.
+	if g := atomic.SwapPointer(&w.g, nil); g != nil && g != (unsafe.Pointer)(&preparingG) {
+		goready(g, 0)
+	}
+}
+
+var waiterPool = sync.Pool{
+	New: func() interface{} {
+		w := &Waiter{}
+		w.Init()
+		return w
+	},
+}
+
+// GetWaiter returns an unused Waiter. PutWaiter should be called to release
+// the Waiter once it is no longer needed.
+//
+// Where possible, users should prefer to associate each goroutine that calls
+// Waiter.Wait() with a distinct pre-allocated Waiter to avoid allocation of
+// Waiters in hot paths.
+func GetWaiter() *Waiter {
+	return waiterPool.Get().(*Waiter)
+}
+
+// PutWaiter releases an unused Waiter previously returned by GetWaiter.
+func PutWaiter(w *Waiter) {
+	waiterPool.Put(w)
+}
-- 
cgit v1.2.3


From 56fd9504aab44a738d3df164cbee8e572b309f28 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:44:22 -0800
Subject: Enable IPV6_RECVTCLASS socket option for datagram sockets

Added the ability to get/set the IP_RECVTCLASS socket option on UDP endpoints.
If enabled, traffic class from the incoming Network Header passed as ancillary
data in the ControlMessages.

Adding Get/SetSockOptBool to decrease the overhead of getting/setting simple
options. (This was absorbed in a CL that will be landing before this one).

Test:
* Added unit test to udp_test.go that tests getting/setting as well as
verifying that we receive expected TOS from incoming packet.
* Added a syscall test for verifying getting/setting
* Removed test skip for existing syscall test to enable end to end test.
PiperOrigin-RevId: 295840218
---
 pkg/sentry/socket/control/control.go         |   2 +-
 pkg/sentry/socket/netstack/netstack.go       |  27 +++++-
 pkg/tcpip/checker/checker.go                 |  14 +++
 pkg/tcpip/tcpip.go                           |  15 ++-
 pkg/tcpip/transport/udp/endpoint.go          |  38 +++++++-
 pkg/tcpip/transport/udp/udp_test.go          | 120 ++++++++++++++----------
 test/syscalls/linux/ip_socket_test_util.h    |  16 ++--
 test/syscalls/linux/socket_ip_udp_generic.cc | 133 +++++++++++++++++++--------
 test/syscalls/linux/udp_socket_test_cases.cc |   4 -
 9 files changed, 260 insertions(+), 109 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4667373d2..8834a1e1a 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -329,7 +329,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 }
 
 // PackTClass packs an IPV6_TCLASS socket control message.
-func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IPV6,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9757fbfba..e187276c5 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1318,6 +1318,22 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		}
 		return ib, nil
 
+	case linux.IPV6_RECVTCLASS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -1803,6 +1819,14 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
 
+	case linux.IPV6_RECVTCLASS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -2086,7 +2110,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
 		linux.IPV6_RECVPATHMTU,
 		linux.IPV6_RECVPKTINFO,
 		linux.IPV6_RECVRTHDR,
-		linux.IPV6_RECVTCLASS,
 		linux.IPV6_RTHDR,
 		linux.IPV6_RTHDRDSTOPTS,
 		linux.IPV6_TCLASS,
@@ -2424,6 +2447,8 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages {
 			Timestamp:       s.readCM.Timestamp,
 			HasTOS:          s.readCM.HasTOS,
 			TOS:             s.readCM.TOS,
+			HasTClass:       s.readCM.HasTClass,
+			TClass:          s.readCM.TClass,
 			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
 			PacketInfo:      s.readCM.PacketInfo,
 		},
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 4d6ae0871..c6c160dfc 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -161,6 +161,20 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTClass creates a checker that checks the TCLASS field in
+// ControlMessages.
+func ReceiveTClass(want uint32) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTClass {
+			t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want)
+		}
+		if got := cm.TClass; got != want {
+			t.Fatalf("got cm.TClass = %d, want %d", got, want)
+		}
+	}
+}
+
 // ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
 func ReceiveTOS(want uint8) ControlMessagesChecker {
 	return func(t *testing.T, cm tcpip.ControlMessages) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 9ca39ce40..ce5527391 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -323,11 +323,11 @@ type ControlMessages struct {
 	// TOS is the IPv4 type of service of the associated packet.
 	TOS uint8
 
-	// HasTClass indicates whether Tclass is valid/set.
+	// HasTClass indicates whether TClass is valid/set.
 	HasTClass bool
 
-	// Tclass is the IPv6 traffic class of the associated packet.
-	TClass int32
+	// TClass is the IPv6 traffic class of the associated packet.
+	TClass uint32
 
 	// HasIPPacketInfo indicates whether PacketInfo is set.
 	HasIPPacketInfo bool
@@ -502,9 +502,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTClassOption is used by SetSockOpt/GetSockOpt to specify if the
+	// IPV6_TCLASS ancillary message is passed with incoming packets.
+	ReceiveTClassOption SockOptBool = iota
+
 	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
 	// ancillary message is passed with incoming packets.
-	ReceiveTOSOption SockOptBool = iota
+	ReceiveTOSOption
 
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
@@ -514,6 +518,9 @@ const (
 	// if more inforamtion is provided with incoming packets such
 	// as interface index and address.
 	ReceiveIPPacketInfoOption
+
+	// TODO(b/146901447): convert existing bool socket options to be handled via
+	// Get/SetSockOptBool
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3fe91cac2..eff7f3600 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,7 +32,8 @@ type udpPacket struct {
 	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
+	// tos stores either the receiveTOS or receiveTClass value.
+	tos uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -119,6 +120,10 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveTClass determines if the incoming IPv6 TClass header field is
+	// passed as ancillary data to ControlMessages on Read.
+	receiveTClass bool
+
 	// receiveIPPacketInfo determines if the packet info is returned by Read.
 	receiveIPPacketInfo bool
 
@@ -258,13 +263,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveTClass := e.receiveTClass
 	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-
+	if receiveTClass {
+		cm.HasTClass = true
+		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
+		cm.TClass = uint32(p.tos)
+	}
 	if receiveIPPacketInfo {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
@@ -490,6 +500,17 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrNotSupported
+		}
+
+		e.mu.Lock()
+		e.receiveTClass = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -709,6 +730,17 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 		return v, nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrNotSupported
+		}
+
+		e.mu.RLock()
+		v := e.receiveTClass
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -1273,6 +1305,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		packet.packetInfo.LocalAddr = r.LocalAddress
 		packet.packetInfo.DestinationAddr = r.RemoteAddress
 		packet.packetInfo.NIC = r.NICID()
+	case header.IPv6ProtocolNumber:
+		packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f0ff3fe71..34b7c2360 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -409,6 +409,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
@@ -1336,7 +1337,7 @@ func TestSetTTL(t *testing.T) {
 	}
 }
 
-func TestTOSV4(t *testing.T) {
+func TestSetTOS(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1347,23 +1348,23 @@ func TestTOSV4(t *testing.T) {
 			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
 			if err := c.ep.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv4TOSOption(tos), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
 			if want := tcpip.IPv4TOSOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
 			testWrite(c, flow, checker.TOS(tos, 0))
@@ -1371,7 +1372,7 @@ func TestTOSV4(t *testing.T) {
 	}
 }
 
-func TestTOSV6(t *testing.T) {
+func TestSetTClass(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1379,71 +1380,92 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tClass = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt failed: %s", err)
+			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tClass)); err != nil {
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv6TrafficClassOption(tClass), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
-			if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+			if want := tcpip.IPv6TrafficClassOption(tClass); v != want {
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
-			testWrite(c, flow, checker.TOS(tos, 0))
+			// The header getter for TClass is called TOS, so use that checker.
+			testWrite(c, flow, checker.TOS(tClass, 0))
 		})
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
+func TestReceiveTosTClass(t *testing.T) {
+	testCases := []struct {
+		name             string
+		getReceiveOption tcpip.SockOptBool
+		tests            []testFlow
+	}{
+		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
+		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+	}
+	for _, testCase := range testCases {
+		for _, flow := range testCase.tests {
+			t.Run(fmt.Sprintf("%s:flow:%s", testCase.name, flow), func(t *testing.T) {
+				c := newDualTestContext(t, defaultMTU)
+				defer c.cleanup()
 
-			c.createEndpointForFlow(flow)
+				c.createEndpointForFlow(flow)
+				option := testCase.getReceiveOption
+				name := testCase.name
 
-			// Verify that setting and reading the option works.
-			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
-			}
+				// Verify that setting and reading the option works.
+				v, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
+				// Test for expected default value.
+				if v != false {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
+				}
 
-			want := true
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
-				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
-			}
+				want := true
+				if err := c.ep.SetSockOptBool(option, want); err != nil {
+					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
+				}
 
-			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			if got != want {
-				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
-			}
+				got, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
 
-			// Verify that the correct received TOS is handed through as
-			// ancillary data to the ControlMessages struct.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatal("Bind failed:", err)
-			}
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
+				if got != want {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
+				}
+
+				// Verify that the correct received TOS or TClass is handed through as
+				// ancillary data to the ControlMessages struct.
+				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+					c.t.Fatalf("Bind failed: %s", err)
+				}
+				switch option {
+				case tcpip.ReceiveTClassOption:
+					testRead(c, flow, checker.ReceiveTClass(testTOS))
+				case tcpip.ReceiveTOSOption:
+					testRead(c, flow, checker.ReceiveTOS(testTOS))
+				default:
+					t.Fatalf("unknown test variant: %s", name)
+				}
+			})
+		}
 	}
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 083ebbcf0..39fd6709d 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -84,20 +84,20 @@ SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type);
 // SocketPairs created with AF_INET and the given type.
 SocketPairKind IPv4UDPUnboundSocketPair(int type);
 
-// IPv4UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_DGRAM, and the given type.
+// IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_DGRAM, and the given type.
 SocketKind IPv4UDPUnboundSocket(int type);
 
-// IPv6UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_DGRAM, and the given type.
+// IPv6UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_DGRAM, and the given type.
 SocketKind IPv6UDPUnboundSocket(int type);
 
-// IPv4TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_STREAM and the given type.
+// IPv4TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_STREAM and the given type.
 SocketKind IPv4TCPUnboundSocket(int type);
 
-// IPv6TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_STREAM and the given type.
+// IPv6TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_STREAM and the given type.
 SocketKind IPv6TCPUnboundSocket(int type);
 
 // IfAddrHelper is a helper class that determines the local interfaces present
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index db5663ecd..1c533fdf2 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -14,6 +14,7 @@
 
 #include "test/syscalls/linux/socket_ip_udp_generic.h"
 
+#include <errno.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -209,46 +210,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -401,5 +362,97 @@ TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
   EXPECT_EQ(get_len, sizeof(get));
 }
 
+// Holds TOS or TClass information for IPv4 or IPv6 respectively.
+struct RecvTosOption {
+  int level;
+  int option;
+};
+
+RecvTosOption GetRecvTosOption(int domain) {
+  TEST_CHECK(domain == AF_INET || domain == AF_INET6);
+  RecvTosOption opt;
+  switch (domain) {
+    case AF_INET:
+      opt.level = IPPROTO_IP;
+      opt.option = IP_RECVTOS;
+      break;
+    case AF_INET6:
+      opt.level = IPPROTO_IPV6;
+      opt.option = IPV6_RECVTCLASS;
+      break;
+  }
+  return opt;
+}
+
+// Ensure that Receiving TOS or TCLASS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS or IPV6_RECVTCLASS works as
+// expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test that any socket (including IPv6 only) accepts the IPv4 TOS option: this
+// mirrors behavior in linux.
+TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(AF_INET);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+}
+
+// Test that an IPv4 socket does not support the IPv6 TClass option.
+TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
+  // This should only test AF_INET sockets for the mismatch behavior.
+  SKIP_IF(GetParam().domain != AF_INET);
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IPV6, IPV6_RECVTCLASS,
+                         &get, &get_len),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 9f8de6b48..57b1a357c 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,9 +1349,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,7 +1419,6 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 92d2d78876a938871327685e1104d7b4ff46986e Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 18 Feb 2020 21:20:41 -0800
Subject: Fix mis-named comment.

---
 pkg/tcpip/stack/registration.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index ec91f60dd..d83adf0ec 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -277,7 +277,7 @@ type NetworkProtocol interface {
 	// DefaultPrefixLen returns the protocol's default prefix length.
 	DefaultPrefixLen() int
 
-	// ParsePorts returns the source and destination addresses stored in a
+	// ParseAddresses returns the source and destination addresses stored in a
 	// packet of this protocol.
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
-- 
cgit v1.2.3


From 2daa21e4d73f2297a8bca32c76100333e9ac4af4 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 16:47:58 -0800
Subject: Internal change.

PiperOrigin-RevId: 296088213
---
 pkg/sentry/socket/netstack/provider.go | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index 5afff2564..5f181f017 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -75,6 +75,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		switch protocol {
 		case syscall.IPPROTO_ICMP:
 			return header.ICMPv4ProtocolNumber, true, nil
+		case syscall.IPPROTO_ICMPV6:
+			return header.ICMPv6ProtocolNumber, true, nil
 		case syscall.IPPROTO_UDP:
 			return header.UDPProtocolNumber, true, nil
 		case syscall.IPPROTO_TCP:
-- 
cgit v1.2.3


From de68e1d8c437e6234f5e413d7cad6f892a4452d3 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Thu, 20 Feb 2020 01:12:03 -0500
Subject: Code Clean:Move getUserRegisters into dieArchSetup() and other small
 changes.

Consistent with QEMU, getUserRegisters() should be an arch-specific
function. So, it should be called in dieArchSetup().

With this patch and the pagetable/pcid patch, the kvm modules on Arm64 can be
built successfully.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/kvm/bluepill.go              |  6 ------
 pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go |  6 ++++++
 pkg/sentry/platform/kvm/machine_arm64_unsafe.go  | 24 ------------------------
 3 files changed, 6 insertions(+), 30 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 35cd55fef..4b23f7803 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -81,12 +81,6 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 	// Save the death message, which will be thrown.
 	c.dieState.message = msg
 
-	// Reload all registers to have an accurate stack trace when we return
-	// to host mode. This means that the stack should be unwound correctly.
-	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
-		throw(msg)
-	}
-
 	// Setup the trampoline.
 	dieArchSetup(c, context, &c.dieState.guestRegs)
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index a63a6a071..99cac665d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -31,6 +31,12 @@ import (
 //
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+	// Reload all registers to have an accurate stack trace when we return
+	// to host mode. This means that the stack should be unwound correctly.
+	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
+		throw(c.dieState.message)
+	}
+
 	// If the vCPU is in user mode, we set the stack to the stored stack
 	// value in the vCPU itself. We don't want to unwind the user stack.
 	if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet {
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 1c8384e6b..b531f2f85 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,30 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// setMemoryRegion initializes a region.
-//
-// This may be called from bluepillHandler, and therefore returns an errno
-// directly (instead of wrapping in an error) to avoid allocations.
-//
-//go:nosplit
-func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
-	userRegion := userMemoryRegion{
-		slot:          uint32(slot),
-		flags:         0,
-		guestPhysAddr: uint64(physical),
-		memorySize:    uint64(length),
-		userspaceAddr: uint64(virtual),
-	}
-
-	// Set the region.
-	_, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(m.fd),
-		_KVM_SET_USER_MEMORY_REGION,
-		uintptr(unsafe.Pointer(&userRegion)))
-	return errno
-}
-
 type kvmVcpuInit struct {
 	target   uint32
 	features [7]uint32
-- 
cgit v1.2.3


From a369c88c0c4ece5239855000d28df045111c1be7 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 29 Jan 2020 04:33:06 -0500
Subject: Lazy-fpsimd support patch series#1: add Arm64-fpsimd support to arch
 module

This patch defines the structures and
adds the implementations for fpsimd initialization.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/arch/arch_aarch64.go | 30 +++++++++++++++++++-----------
 pkg/sentry/arch/arch_arm64.go   | 16 +++++++++++++---
 pkg/sentry/arch/signal_arm64.go | 19 ++++++++++++++++---
 3 files changed, 48 insertions(+), 17 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 3b6987665..53039465d 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -32,29 +32,35 @@ import (
 const (
 	// SyscallWidth is the width of insturctions.
 	SyscallWidth = 4
+
+	// fpsimdMagic is the magic number which is used in fpsimd_context.
+	fpsimdMagic = 0x46508001
+
+	// fpsimdContextSize is the size of fpsimd_context.
+	fpsimdContextSize = 0x210
 )
 
 // aarch64FPState is aarch64 floating point state.
 type aarch64FPState []byte
 
-// initAarch64FPState (defined in asm files) sets up initial state.
-func initAarch64FPState(data *FloatingPointData) {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+// initAarch64FPState sets up initial state.
+func initAarch64FPState(data aarch64FPState) {
+	binary.LittleEndian.PutUint32(data, fpsimdMagic)
+	binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
 }
 
 func newAarch64FPStateSlice() []byte {
-	return alignedBytes(4096, 32)[:4096]
+	return alignedBytes(4096, 16)[:fpsimdContextSize]
 }
 
 // newAarch64FPState returns an initialized floating point state.
 //
 // The returned state is large enough to store all floating point state
 // supported by host, even if the app won't use much of it due to a restricted
-// FeatureSet. Since they may still be able to see state not advertised by
-// CPUID we must ensure it does not contain any sentry state.
+// FeatureSet.
 func newAarch64FPState() aarch64FPState {
 	f := aarch64FPState(newAarch64FPStateSlice())
-	initAarch64FPState(f.FloatingPointData())
+	initAarch64FPState(f)
 	return f
 }
 
@@ -133,10 +139,10 @@ func (s State) Proto() *rpb.Registers {
 
 // Fork creates and returns an identical copy of the state.
 func (s *State) Fork() State {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
 	return State{
-		Regs:       s.Regs,
-		FeatureSet: s.FeatureSet,
+		Regs:           s.Regs,
+		aarch64FPState: s.aarch64FPState.fork(),
+		FeatureSet:     s.FeatureSet,
 	}
 }
 
@@ -285,8 +291,10 @@ func New(arch Arch, fs *cpuid.FeatureSet) Context {
 	case ARM64:
 		return &context64{
 			State{
-				FeatureSet: fs,
+				aarch64FPState: newAarch64FPState(),
+				FeatureSet:     fs,
 			},
+			[]aarch64FPState(nil),
 		}
 	}
 	panic(fmt.Sprintf("unknown architecture %v", arch))
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 94f1a808f..372b650b9 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -68,6 +68,7 @@ const (
 // context64 represents an ARM64 context.
 type context64 struct {
 	State
+	sigFPState []aarch64FPState // fpstate to be restored on sigreturn.
 }
 
 // Arch implements Context.Arch.
@@ -75,10 +76,19 @@ func (c *context64) Arch() Arch {
 	return ARM64
 }
 
+func (c *context64) copySigFPState() []aarch64FPState {
+	var sigfps []aarch64FPState
+	for _, s := range c.sigFPState {
+		sigfps = append(sigfps, s.fork())
+	}
+	return sigfps
+}
+
 // Fork returns an exact copy of this context.
 func (c *context64) Fork() Context {
 	return &context64{
-		State: c.State.Fork(),
+		State:      c.State.Fork(),
+		sigFPState: c.copySigFPState(),
 	}
 }
 
@@ -137,8 +147,8 @@ func (c *context64) SetTLS(value uintptr) bool {
 	return false
 }
 
-// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
-func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 	c.Regs.Regs[3] = uint64(value)
 }
 
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 4f4cc46a8..28615f97f 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -30,14 +30,27 @@ type SignalContext64 struct {
 	Sp        uint64
 	Pc        uint64
 	Pstate    uint64
-	_pad      [8]byte // __attribute__((__aligned__(16)))
-	Reserved  [4096]uint8
+	_pad      [8]byte       // __attribute__((__aligned__(16)))
+	Fpsimd64  FpsimdContext // size = 528
+	Reserved  [3568]uint8
+}
+
+type aarch64Ctx struct {
+	Magic uint32
+	Size  uint32
+}
+
+type FpsimdContext struct {
+	Head  aarch64Ctx
+	Fpsr  uint32
+	Fpcr  uint32
+	Vregs [64]uint64 // actually [32]uint128
 }
 
 // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
 type UContext64 struct {
 	Flags  uint64
-	Link   *UContext64
+	Link   uint64
 	Stack  SignalStack
 	Sigset linux.SignalSet
 	// glibc uses a 1024-bit sigset_t
-- 
cgit v1.2.3


From 10ed60e4778e01a2813eea6e3c201826f75982a5 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 09:57:06 -0800
Subject: VFS2: Support memory mapping in tmpfs.

tmpfs.fileDescription now implements ConfigureMMap. And tmpfs.regularFile
implement memmap.Mappable. The methods are mostly unchanged from VFS1 tmpfs.

PiperOrigin-RevId: 296234557
---
 pkg/sentry/fsimpl/tmpfs/filesystem.go   |   8 +-
 pkg/sentry/fsimpl/tmpfs/regular_file.go | 233 +++++++++++++++++++++++++++-----
 pkg/sentry/fsimpl/tmpfs/tmpfs.go        |  16 ++-
 3 files changed, 213 insertions(+), 44 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 7f7b791c4..e1b551422 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -16,7 +16,6 @@ package tmpfs
 
 import (
 	"fmt"
-	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -347,10 +346,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 			return nil, err
 		}
 		if opts.Flags&linux.O_TRUNC != 0 {
-			impl.mu.Lock()
-			impl.data.Truncate(0, impl.memFile)
-			atomic.StoreUint64(&impl.size, 0)
-			impl.mu.Unlock()
+			if _, err := impl.truncate(0); err != nil {
+				return nil, err
+			}
 		}
 		return &fd.vfsfd, nil
 	case *directory:
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index dab346a41..711442424 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -15,6 +15,7 @@
 package tmpfs
 
 import (
+	"fmt"
 	"io"
 	"math"
 	"sync/atomic"
@@ -22,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -34,25 +36,53 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// regularFile is a regular (=S_IFREG) tmpfs file.
 type regularFile struct {
 	inode inode
 
 	// memFile is a platform.File used to allocate pages to this regularFile.
 	memFile *pgalloc.MemoryFile
 
-	// mu protects the fields below.
-	mu sync.RWMutex
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// mappings tracks mappings of the file into memmap.MappingSpaces.
+	//
+	// Protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// writableMappingPages tracks how many pages of virtual memory are mapped
+	// as potentially writable from this file. If a page has multiple mappings,
+	// each mapping is counted separately.
+	//
+	// This counter is susceptible to overflow as we can potentially count
+	// mappings from many VMAs. We count pages rather than bytes to slightly
+	// mitigate this.
+	//
+	// Protected by mapsMu.
+	writableMappingPages uint64
+
+	// dataMu protects the fields below.
+	dataMu sync.RWMutex
 
 	// data maps offsets into the file to offsets into memFile that store
 	// the file's data.
+	//
+	// Protected by dataMu.
 	data fsutil.FileRangeSet
 
-	// size is the size of data, but accessed using atomic memory
-	// operations to avoid locking in inode.stat().
-	size uint64
-
 	// seals represents file seals on this inode.
+	//
+	// Protected by dataMu.
 	seals uint32
+
+	// size is the size of data.
+	//
+	// Protected by both dataMu and inode.mu; reading it requires holding
+	// either mutex, while writing requires holding both AND using atomics.
+	// Readers that do not require consistency (like Stat) may read the
+	// value atomically without holding either lock.
+	size uint64
 }
 
 func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
@@ -66,39 +96,170 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod
 
 // truncate grows or shrinks the file to the given size. It returns true if the
 // file size was updated.
-func (rf *regularFile) truncate(size uint64) (bool, error) {
-	rf.mu.Lock()
-	defer rf.mu.Unlock()
+func (rf *regularFile) truncate(newSize uint64) (bool, error) {
+	rf.inode.mu.Lock()
+	defer rf.inode.mu.Unlock()
+	return rf.truncateLocked(newSize)
+}
 
-	if size == rf.size {
+// Preconditions: rf.inode.mu must be held.
+func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
+	oldSize := rf.size
+	if newSize == oldSize {
 		// Nothing to do.
 		return false, nil
 	}
 
-	if size > rf.size {
-		// Growing the file.
+	// Need to hold inode.mu and dataMu while modifying size.
+	rf.dataMu.Lock()
+	if newSize > oldSize {
+		// Can we grow the file?
 		if rf.seals&linux.F_SEAL_GROW != 0 {
-			// Seal does not allow growth.
+			rf.dataMu.Unlock()
 			return false, syserror.EPERM
 		}
-		rf.size = size
+		// We only need to update the file size.
+		atomic.StoreUint64(&rf.size, newSize)
+		rf.dataMu.Unlock()
 		return true, nil
 	}
 
-	// Shrinking the file
+	// We are shrinking the file. First check if this is allowed.
 	if rf.seals&linux.F_SEAL_SHRINK != 0 {
-		// Seal does not allow shrink.
+		rf.dataMu.Unlock()
 		return false, syserror.EPERM
 	}
 
-	// TODO(gvisor.dev/issues/1197): Invalidate mappings once we have
-	// mappings.
+	// Update the file size.
+	atomic.StoreUint64(&rf.size, newSize)
+	rf.dataMu.Unlock()
+
+	// Invalidate past translations of truncated pages.
+	oldpgend := fs.OffsetPageEnd(int64(oldSize))
+	newpgend := fs.OffsetPageEnd(int64(newSize))
+	if newpgend < oldpgend {
+		rf.mapsMu.Lock()
+		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+			// Compare Linux's mm/shmem.c:shmem_setattr() =>
+			// mm/memory.c:unmap_mapping_range(evencows=1).
+			InvalidatePrivate: true,
+		})
+		rf.mapsMu.Unlock()
+	}
 
-	rf.data.Truncate(size, rf.memFile)
-	rf.size = size
+	// We are now guaranteed that there are no translations of truncated pages,
+	// and can remove them.
+	rf.dataMu.Lock()
+	rf.data.Truncate(newSize, rf.memFile)
+	rf.dataMu.Unlock()
 	return true, nil
 }
 
+// AddMapping implements memmap.Mappable.AddMapping.
+func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+
+	// Reject writable mapping if F_SEAL_WRITE is set.
+	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
+		return syserror.EPERM
+	}
+
+	rf.mappings.AddMapping(ms, ar, offset, writable)
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages < pagesBefore {
+			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+
+	rf.mappings.RemoveMapping(ms, ar, offset, writable)
+
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages > pagesBefore {
+			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	rf.dataMu.Lock()
+	defer rf.dataMu.Unlock()
+
+	// Constrain translations to f.attr.Size (rounded up) to prevent
+	// translation to pages that may be concurrently truncated.
+	pgend := fs.OffsetPageEnd(int64(rf.size))
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+		// Newly-allocated pages are zeroed, so we don't need to do anything.
+		return dsts.NumBytes(), nil
+	})
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   rf.memFile,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  usermem.AnyAccess,
+		})
+		translatedEnd = segMR.End
+	}
+
+	// Don't return the error returned by f.data.Fill if it occurred outside of
+	// required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (*regularFile) InvalidateUnsavable(context.Context) error {
+	return nil
+}
+
 type regularFileFD struct {
 	fileDescription
 
@@ -152,8 +313,10 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		// Overflow.
 		return 0, syserror.EFBIG
 	}
+	f.inode.mu.Lock()
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := src.CopyInTo(ctx, rw)
+	f.inode.mu.Unlock()
 	putRegularFileReadWriter(rw)
 	return n, err
 }
@@ -215,6 +378,12 @@ func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng
 	return nil
 }
 
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	file := fd.inode().impl.(*regularFile)
+	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
+}
+
 // regularFileReadWriter implements safemem.Reader and Safemem.Writer.
 type regularFileReadWriter struct {
 	file *regularFile
@@ -244,14 +413,15 @@ func putRegularFileReadWriter(rw *regularFileReadWriter) {
 
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	rw.file.mu.RLock()
+	rw.file.dataMu.RLock()
+	defer rw.file.dataMu.RUnlock()
+	size := rw.file.size
 
 	// Compute the range to read (limited by file size and overflow-checked).
-	if rw.off >= rw.file.size {
-		rw.file.mu.RUnlock()
+	if rw.off >= size {
 		return 0, io.EOF
 	}
-	end := rw.file.size
+	end := size
 	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
 		end = rend
 	}
@@ -265,7 +435,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			// Get internal mappings.
 			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -275,7 +444,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			rw.off += uint64(n)
 			dsts = dsts.DropFirst64(n)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -291,7 +459,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			rw.off += uint64(n)
 			dsts = dsts.DropFirst64(n)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -299,13 +466,16 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
 		}
 	}
-	rw.file.mu.RUnlock()
 	return done, nil
 }
 
 // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: inode.mu must be held.
 func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	rw.file.mu.Lock()
+	// Hold dataMu so we can modify size.
+	rw.file.dataMu.Lock()
+	defer rw.file.dataMu.Unlock()
 
 	// Compute the range to write (overflow-checked).
 	end := rw.off + srcs.NumBytes()
@@ -316,7 +486,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 	// Check if seals prevent either file growth or all writes.
 	switch {
 	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
-		rw.file.mu.Unlock()
 		return 0, syserror.EPERM
 	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
 		// When growth is sealed, Linux effectively allows writes which would
@@ -338,7 +507,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 		}
 		if end <= rw.off {
 			// Truncation would result in no data being written.
-			rw.file.mu.Unlock()
 			return 0, syserror.EPERM
 		}
 	}
@@ -395,9 +563,8 @@ exitLoop:
 	// If the write ends beyond the file's previous size, it causes the
 	// file to grow.
 	if rw.off > rw.file.size {
-		atomic.StoreUint64(&rw.file.size, rw.off)
+		rw.file.size = rw.off
 	}
 
-	rw.file.mu.Unlock()
 	return done, retErr
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index c5bb17562..521206305 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -18,9 +18,10 @@
 // Lock order:
 //
 // filesystem.mu
-//   regularFileFD.offMu
-//     regularFile.mu
 //   inode.mu
+//     regularFileFD.offMu
+//       regularFile.mapsMu
+//         regularFile.dataMu
 package tmpfs
 
 import (
@@ -226,12 +227,15 @@ func (i *inode) tryIncRef() bool {
 
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
-		// This is unnecessary; it's mostly to simulate what tmpfs would do.
 		if regFile, ok := i.impl.(*regularFile); ok {
-			regFile.mu.Lock()
+			// Hold inode.mu and regFile.dataMu while mutating
+			// size.
+			i.mu.Lock()
+			regFile.dataMu.Lock()
 			regFile.data.DropAll(regFile.memFile)
 			atomic.StoreUint64(&regFile.size, 0)
-			regFile.mu.Unlock()
+			regFile.dataMu.Unlock()
+			i.mu.Unlock()
 		}
 	} else if refs < 0 {
 		panic("tmpfs.inode.decRef() called without holding a reference")
@@ -320,7 +324,7 @@ func (i *inode) setStat(stat linux.Statx) error {
 	if mask&linux.STATX_SIZE != 0 {
 		switch impl := i.impl.(type) {
 		case *regularFile:
-			updated, err := impl.truncate(stat.Size)
+			updated, err := impl.truncateLocked(stat.Size)
 			if err != nil {
 				return err
 			}
-- 
cgit v1.2.3


From 9a4e3e63ef3c771e9fab3d19ee8ad0a173c7c4eb Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 11:14:53 -0800
Subject: Re-add atomicbitops_arm64.s to BUILD.

This was inadverently dropped by cl/295811743.

PiperOrigin-RevId: 296254482
---
 pkg/atomicbitops/BUILD | 1 +
 1 file changed, 1 insertion(+)

(limited to 'pkg')

diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index ba8b06071..1a30f6967 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "atomicbitops.go",
         "atomicbitops_amd64.s",
+        "atomicbitops_arm64.s",
         "atomicbitops_noasm.go",
     ],
     visibility = ["//:sandbox"],
-- 
cgit v1.2.3


From 9bad87339a10545d267903e7739f8cd978fbd82a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 11:29:59 -0800
Subject: Better strace logging for epoll syscalls.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Example:

epoll_ctl(0x3 anon_inode:[eventpoll], EPOLL_CTL_ADD, 0x6 anon_inode:[eventfd], 0x7efe2fd92a80 {events=EPOLLIN|EPOLLOUT data=0x10203040506070a}) = 0x0 (4.411µs)

epoll_wait(0x3 anon_inode:[eventpoll], 0x7efe2fd92b50 {{events=EPOLLOUT data=0x102030405060708}{events=EPOLLOUT data=0x102030405060708}{events=EPOLLOUT data=0x102030405060708}}, 0x3, 0xffffffff) = 0x3 (29.891µs)

PiperOrigin-RevId: 296258146
---
 pkg/abi/linux/epoll.go             |  7 +++
 pkg/sentry/strace/BUILD            |  1 +
 pkg/sentry/strace/epoll.go         | 89 ++++++++++++++++++++++++++++++++++++++
 pkg/sentry/strace/linux64_amd64.go |  6 +--
 pkg/sentry/strace/linux64_arm64.go |  4 +-
 pkg/sentry/strace/strace.go        |  8 ++++
 pkg/sentry/strace/syscalls.go      | 10 +++++
 7 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 pkg/sentry/strace/epoll.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go
index 6e4de69da..1121a1a92 100644
--- a/pkg/abi/linux/epoll.go
+++ b/pkg/abi/linux/epoll.go
@@ -14,6 +14,10 @@
 
 package linux
 
+import (
+	"gvisor.dev/gvisor/pkg/binary"
+)
+
 // Event masks.
 const (
 	EPOLLIN     = 0x1
@@ -53,3 +57,6 @@ const (
 	EPOLL_CTL_DEL = 0x2
 	EPOLL_CTL_MOD = 0x3
 )
+
+// SizeOfEpollEvent is the size of EpollEvent struct.
+var SizeOfEpollEvent = int(binary.Size(EpollEvent{}))
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 2f39a6f2b..88d5db9fc 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "capability.go",
         "clone.go",
+        "epoll.go",
         "futex.go",
         "linux64_amd64.go",
         "linux64_arm64.go",
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
new file mode 100644
index 000000000..a6e48b836
--- /dev/null
+++ b/pkg/sentry/strace/epoll.go
@@ -0,0 +1,89 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package strace
+
+import (
+	"fmt"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string {
+	var e linux.EpollEvent
+	if _, err := t.CopyIn(eventAddr, &e); err != nil {
+		return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err)
+	}
+	var sb strings.Builder
+	fmt.Fprintf(&sb, "%#x ", eventAddr)
+	writeEpollEvent(&sb, e)
+	return sb.String()
+}
+
+func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes uint64) string {
+	var sb strings.Builder
+	fmt.Fprintf(&sb, "%#x {", eventsAddr)
+	addr := eventsAddr
+	for i := uint64(0); i < numEvents; i++ {
+		var e linux.EpollEvent
+		if _, err := t.CopyIn(addr, &e); err != nil {
+			fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err)
+			continue
+		}
+		writeEpollEvent(&sb, e)
+		if uint64(sb.Len()) >= maxBytes {
+			sb.WriteString("...")
+			break
+		}
+		if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok {
+			fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr)
+			continue
+		}
+	}
+	sb.WriteString("}")
+	return sb.String()
+}
+
+func writeEpollEvent(sb *strings.Builder, e linux.EpollEvent) {
+	events := epollEventEvents.Parse(uint64(e.Events))
+	fmt.Fprintf(sb, "{events=%s data=[%#x, %#x]}", events, e.Data[0], e.Data[1])
+}
+
+var epollCtlOps = abi.ValueSet{
+	linux.EPOLL_CTL_ADD: "EPOLL_CTL_ADD",
+	linux.EPOLL_CTL_DEL: "EPOLL_CTL_DEL",
+	linux.EPOLL_CTL_MOD: "EPOLL_CTL_MOD",
+}
+
+var epollEventEvents = abi.FlagSet{
+	{Flag: linux.EPOLLIN, Name: "EPOLLIN"},
+	{Flag: linux.EPOLLPRI, Name: "EPOLLPRI"},
+	{Flag: linux.EPOLLOUT, Name: "EPOLLOUT"},
+	{Flag: linux.EPOLLERR, Name: "EPOLLERR"},
+	{Flag: linux.EPOLLHUP, Name: "EPULLHUP"},
+	{Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"},
+	{Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"},
+	{Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"},
+	{Flag: linux.EPOLLWRBAND, Name: "EPOLLWRBAND"},
+	{Flag: linux.EPOLLMSG, Name: "EPOLLMSG"},
+	{Flag: linux.EPOLLRDHUP, Name: "EPOLLRDHUP"},
+	{Flag: linux.EPOLLEXCLUSIVE, Name: "EPOLLEXCLUSIVE"},
+	{Flag: linux.EPOLLWAKEUP, Name: "EPOLLWAKEUP"},
+	{Flag: linux.EPOLLONESHOT, Name: "EPOLLONESHOT"},
+	{Flag: linux.EPOLLET, Name: "EPOLLET"},
+}
diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index a4de545e9..71b92eaee 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -256,8 +256,8 @@ var linuxAMD64 = SyscallMap{
 	229: makeSyscallInfo("clock_getres", Hex, PostTimespec),
 	230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
 	231: makeSyscallInfo("exit_group", Hex),
-	232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex),
-	233: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
+	232: makeSyscallInfo("epoll_wait", FD, EpollEvents, Hex, Hex),
+	233: makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent),
 	234: makeSyscallInfo("tgkill", Hex, Hex, Signal),
 	235: makeSyscallInfo("utimes", Path, Timeval),
 	// 236: vserver (not implemented in the Linux kernel)
@@ -305,7 +305,7 @@ var linuxAMD64 = SyscallMap{
 	278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex),
 	279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
 	280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex),
-	281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	281: makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex),
 	282: makeSyscallInfo("signalfd", Hex, Hex, Hex),
 	283: makeSyscallInfo("timerfd_create", Hex, Hex),
 	284: makeSyscallInfo("eventfd", Hex),
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
index 8bc38545f..bd7361a52 100644
--- a/pkg/sentry/strace/linux64_arm64.go
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -45,8 +45,8 @@ var linuxARM64 = SyscallMap{
 	18:  makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
 	19:  makeSyscallInfo("eventfd2", Hex, Hex),
 	20:  makeSyscallInfo("epoll_create1", Hex),
-	21:  makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
-	22:  makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	21:  makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent),
+	22:  makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex),
 	23:  makeSyscallInfo("dup", FD),
 	24:  makeSyscallInfo("dup3", FD, FD, Hex),
 	25:  makeSyscallInfo("fcntl", FD, Hex, Hex),
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 46cb2a1cc..77655558e 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -481,6 +481,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 			output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer()))
 		case PollFDs:
 			output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false))
+		case EpollCtlOp:
+			output = append(output, epollCtlOps.Parse(uint64(args[arg].Int())))
+		case EpollEvent:
+			output = append(output, epollEvent(t, args[arg].Pointer()))
+		case EpollEvents:
+			output = append(output, epollEvents(t, args[arg].Pointer(), 0 /* numEvents */, uint64(maximumBlobSize)))
 		case SelectFDSet:
 			output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer()))
 		case Oct:
@@ -549,6 +555,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint
 			output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer())
 		case PollFDs:
 			output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true)
+		case EpollEvents:
+			output[arg] = epollEvents(t, args[arg].Pointer(), uint64(rval), uint64(maximumBlobSize))
 		case GetSockOptVal:
 			output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval)
 		case SetSockOptVal:
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 446d1e0f6..7e69b9279 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -228,6 +228,16 @@ const (
 	// SockOptLevel is the optname argument in getsockopt(2) and
 	// setsockopt(2).
 	SockOptName
+
+	// EpollCtlOp is the op argument to epoll_ctl(2).
+	EpollCtlOp
+
+	// EpollEvent is the event argument in epoll_ctl(2).
+	EpollEvent
+
+	// EpollEvents is an array of struct epoll_event. It is the events
+	// argument in epoll_wait(2)/epoll_pwait(2).
+	EpollEvents
 )
 
 // defaultFormat is the syscall argument format to use if the actual format is
-- 
cgit v1.2.3


From d90d71474f4c82f742140fdf026821709845cece Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 14:28:31 -0800
Subject: Remove bytes read/written from marshal.Marshallable API.

Users of the API only care about whether the copy in/out succeeds in
their entirety, which is already signalled by the returned error.

PiperOrigin-RevId: 296297843
---
 pkg/sentry/kernel/rseq.go                          |  2 +-
 pkg/sentry/syscalls/linux/sys_stat.go              |  6 ++----
 tools/go_marshal/gomarshal/generator_interfaces.go | 21 +++++++++++----------
 tools/go_marshal/gomarshal/generator_tests.go      |  2 +-
 tools/go_marshal/marshal/marshal.go                |  4 ++--
 5 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 18416643b..ded95f532 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -304,7 +304,7 @@ func (t *Task) rseqAddrInterrupt() {
 	}
 
 	var cs linux.RSeqCriticalSection
-	if _, err := cs.CopyIn(t, critAddr); err != nil {
+	if err := cs.CopyIn(t, critAddr); err != nil {
 		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 8b66a9006..11f25e00d 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -131,8 +131,7 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
 		return err
 	}
 	s := statFromAttrs(t, d.Inode.StableAttr, uattr)
-	_, err = s.CopyOut(t, statAddr)
-	return err
+	return s.CopyOut(t, statAddr)
 }
 
 // fstat implements fstat for the given *fs.File.
@@ -142,8 +141,7 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
 		return err
 	}
 	s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
-	_, err = s.CopyOut(t, statAddr)
-	return err
+	return s.CopyOut(t, statAddr)
 }
 
 // Statx implements linux syscall statx(2).
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index 3aa299ccd..834c58cee 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -507,13 +507,14 @@ func (g *interfaceGenerator) emitMarshallable() {
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
 			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
 			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("return task.CopyOutBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("return err\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -539,11 +540,11 @@ func (g *interfaceGenerator) emitMarshallable() {
 			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
 			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
 
-			g.emit("len, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyOutBytes(addr, buf)\n")
 			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
 			g.emit("// must live until after the CopyOutBytes.\n")
 			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return len, err\n")
+			g.emit("return err\n")
 		} else {
 			fallback()
 		}
@@ -553,20 +554,20 @@ func (g *interfaceGenerator) emitMarshallable() {
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) error {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
 			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("n, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
 			g.emit("if err != nil {\n")
 			g.inIndent(func() {
-				g.emit("return n, err\n")
+				g.emit("return err\n")
 			})
 			g.emit("}\n")
 
 			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
-			g.emit("return n, nil\n")
+			g.emit("return nil\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -592,11 +593,11 @@ func (g *interfaceGenerator) emitMarshallable() {
 			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
 			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
 
-			g.emit("len, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("_, err := task.CopyInBytes(addr, buf)\n")
 			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
 			g.emit("// must live until after the CopyInBytes.\n")
 			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return len, err\n")
+			g.emit("return err\n")
 		} else {
 			fallback()
 		}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index 8c28b00d0..2326e7a07 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -92,7 +92,7 @@ func (g *testGenerator) emitTestNonZeroSize() {
 		g.emit("x := &%s{}\n", g.typeName())
 		g.emit("if x.SizeBytes() == 0 {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(\"Marshallable.Size() should not return zero\")\n")
+			g.emit("t.Fatal(\"Marshallable.SizeBytes() should not return zero\")\n")
 		})
 		g.emit("}\n")
 	})
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
index 20353850d..f129788e0 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/tools/go_marshal/marshal/marshal.go
@@ -91,12 +91,12 @@ type Marshallable interface {
 	// marshalled does not escape. The implementation should avoid creating
 	// extra copies in memory by directly deserializing to the object's
 	// underlying memory.
-	CopyIn(task Task, addr usermem.Addr) (int, error)
+	CopyIn(task Task, addr usermem.Addr) error
 
 	// CopyOut serializes a Marshallable type to a task's memory. This may only
 	// be called from a task goroutine. This is more efficient than calling
 	// MarshalUnsafe on Marshallable.Packed types, as the type being serialized
 	// does not escape. The implementation should avoid creating extra copies in
 	// memory by directly serializing from the object's underlying memory.
-	CopyOut(task Task, addr usermem.Addr) (int, error)
+	CopyOut(task Task, addr usermem.Addr) error
 }
-- 
cgit v1.2.3


From 67b615b86f2aa1d4ded3dcf2eb8aca4e7fec57a0 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 14:31:39 -0800
Subject: Support disabling a NIC

- Disabled NICs will have their associated NDP state cleared.
- Disabled NICs will not accept incoming packets.
- Writes through a Route with a disabled NIC will return an invalid
  endpoint state error.
- stack.Stack.FindRoute will not return a route with a disabled NIC.
- NIC's Running flag will report the NIC's enabled status.

Tests:
- stack_test.TestDisableUnknownNIC
- stack_test.TestDisabledNICsNICInfoAndCheckNIC
- stack_test.TestRoutesWithDisabledNIC
- stack_test.TestRouteWritePacketWithDisabledNIC
- stack_test.TestStopStartSolicitingRouters
- stack_test.TestCleanupNDPState
- stack_test.TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable
- stack_test.TestJoinLeaveAllNodesMulticastOnNICEnableDisable
PiperOrigin-RevId: 296298588
---
 pkg/tcpip/stack/ndp.go        |  23 +-
 pkg/tcpip/stack/ndp_test.go   | 822 ++++++++++++++++++++++++------------------
 pkg/tcpip/stack/nic.go        | 110 +++++-
 pkg/tcpip/stack/stack.go      |  45 ++-
 pkg/tcpip/stack/stack_test.go | 377 ++++++++++++++++++-
 5 files changed, 998 insertions(+), 379 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 045409bda..19bd05aa3 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1148,22 +1148,27 @@ func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bo
 	return true
 }
 
-// cleanupHostOnlyState cleans up any state that is only useful for hosts.
+// cleanupState cleans up ndp's state.
 //
-// cleanupHostOnlyState MUST be called when ndp's NIC is transitioning from a
-// host to a router. This function will invalidate all discovered on-link
-// prefixes, discovered routers, and auto-generated addresses as routers do not
-// normally process Router Advertisements to discover default routers and
-// on-link prefixes, and auto-generate addresses via SLAAC.
+// If hostOnly is true, then only host-specific state will be cleaned up.
+//
+// cleanupState MUST be called with hostOnly set to true when ndp's NIC is
+// transitioning from a host to a router. This function will invalidate all
+// discovered on-link prefixes, discovered routers, and auto-generated
+// addresses.
+//
+// If hostOnly is true, then the link-local auto-generated address will not be
+// invalidated as routers are also expected to generate a link-local address.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupHostOnlyState() {
+func (ndp *ndpState) cleanupState(hostOnly bool) {
 	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
 	linkLocalAddrs := 0
 	for addr := range ndp.autoGenAddresses {
 		// RFC 4862 section 5 states that routers are also expected to generate a
-		// link-local address so we do not invalidate them.
-		if linkLocalSubnet.Contains(addr) {
+		// link-local address so we do not invalidate them if we are cleaning up
+		// host-only state.
+		if hostOnly && linkLocalSubnet.Contains(addr) {
 			linkLocalAddrs++
 			continue
 		}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 1f6f77439..f7b75b74e 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -592,70 +592,94 @@ func TestDADFail(t *testing.T) {
 	}
 }
 
-// TestDADStop tests to make sure that the DAD process stops when an address is
-// removed.
 func TestDADStop(t *testing.T) {
 	const nicID = 1
 
-	ndpDisp := ndpDispatcher{
-		dadC: make(chan ndpDADEvent, 1),
-	}
-	ndpConfigs := stack.NDPConfigurations{
-		RetransmitTimer:        time.Second,
-		DupAddrDetectTransmits: 2,
-	}
-	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPDisp:          &ndpDisp,
-		NDPConfigs:       ndpConfigs,
-	}
+	tests := []struct {
+		name   string
+		stopFn func(t *testing.T, s *stack.Stack)
+	}{
+		// Tests to make sure that DAD stops when an address is removed.
+		{
+			name: "Remove address",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.RemoveAddress(nicID, addr1); err != nil {
+					t.Fatalf("RemoveAddress(%d, %s): %s", nicID, addr1, err)
+				}
+			},
+		},
 
-	e := channel.New(0, 1280, linkAddr1)
-	s := stack.New(opts)
-	if err := s.CreateNIC(nicID, e); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+		// Tests to make sure that DAD stops when the NIC is disabled.
+		{
+			name: "Disable NIC",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 	}
 
-	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
-	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent, 1),
+			}
+			ndpConfigs := stack.NDPConfigurations{
+				RetransmitTimer:        time.Second,
+				DupAddrDetectTransmits: 2,
+			}
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
+				NDPConfigs:       ndpConfigs,
+			}
 
-	// Address should not be considered bound to the NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
-	}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(opts)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
 
-	// Remove the address. This should stop DAD.
-	if err := s.RemoveAddress(nicID, addr1); err != nil {
-		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
-	}
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
 
-	// Wait for DAD to fail (since the address was removed during DAD).
-	select {
-	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
-		// If we don't get a failure event after the expected resolution
-		// time + extra 1s buffer, something is wrong.
-		t.Fatal("timed out waiting for DAD failure")
-	case e := <-ndpDisp.dadC:
-		if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
-			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
-		}
-	}
-	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
-	}
+			// Address should not be considered bound to the NIC yet (DAD ongoing).
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			test.stopFn(t, s)
+
+			// Wait for DAD to fail (since the address was removed during DAD).
+			select {
+			case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+				// If we don't get a failure event after the expected resolution
+				// time + extra 1s buffer, something is wrong.
+				t.Fatal("timed out waiting for DAD failure")
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			}
+			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
 
-	// Should not have sent more than 1 NS message.
-	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
-		t.Fatalf("got NeighborSolicit = %d, want <= 1", got)
+			// Should not have sent more than 1 NS message.
+			if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
+				t.Errorf("got NeighborSolicit = %d, want <= 1", got)
+			}
+		})
 	}
 }
 
@@ -2886,17 +2910,16 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 	}
 }
 
-// TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers
-// and prefixes, and non-linklocal auto-generated addresses are invalidated when
-// a NIC becomes a router.
-func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
+// TestCleanupNDPState tests that all discovered routers and prefixes, and
+// auto-generated addresses are invalidated when a NIC becomes a router.
+func TestCleanupNDPState(t *testing.T) {
 	t.Parallel()
 
 	const (
-		lifetimeSeconds = 5
-		maxEvents       = 4
-		nicID1          = 1
-		nicID2          = 2
+		lifetimeSeconds          = 5
+		maxRouterAndPrefixEvents = 4
+		nicID1                   = 1
+		nicID2                   = 2
 	)
 
 	prefix1, subnet1, e1Addr1 := prefixSubnetAddr(0, linkAddr1)
@@ -2912,254 +2935,308 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		PrefixLen: 64,
 	}
 
-	ndpDisp := ndpDispatcher{
-		routerC:        make(chan ndpRouterEvent, maxEvents),
-		rememberRouter: true,
-		prefixC:        make(chan ndpPrefixEvent, maxEvents),
-		rememberPrefix: true,
-		autoGenAddrC:   make(chan ndpAutoGenAddrEvent, maxEvents),
-	}
-	s := stack.New(stack.Options{
-		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-		AutoGenIPv6LinkLocal: true,
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: true,
-			DiscoverOnLinkPrefixes: true,
-			AutoGenGlobalAddresses: true,
+	tests := []struct {
+		name                 string
+		cleanupFn            func(t *testing.T, s *stack.Stack)
+		keepAutoGenLinkLocal bool
+		maxAutoGenAddrEvents int
+	}{
+		// A NIC should still keep its auto-generated link-local address when
+		// becoming a router.
+		{
+			name: "Forwarding Enable",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
+			keepAutoGenLinkLocal: true,
+			maxAutoGenAddrEvents: 4,
 		},
-		NDPDisp: &ndpDisp,
-	})
 
-	expectRouterEvent := func() (bool, ndpRouterEvent) {
-		select {
-		case e := <-ndpDisp.routerC:
-			return true, e
-		default:
-		}
+		// A NIC should cleanup all NDP state when it is disabled.
+		{
+			name: "NIC Disable",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-		return false, ndpRouterEvent{}
+				if err := s.DisableNIC(nicID1); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+				}
+				if err := s.DisableNIC(nicID2); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+				}
+			},
+			keepAutoGenLinkLocal: false,
+			maxAutoGenAddrEvents: 6,
+		},
 	}
 
-	expectPrefixEvent := func() (bool, ndpPrefixEvent) {
-		select {
-		case e := <-ndpDisp.prefixC:
-			return true, e
-		default:
-		}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				routerC:        make(chan ndpRouterEvent, maxRouterAndPrefixEvents),
+				rememberRouter: true,
+				prefixC:        make(chan ndpPrefixEvent, maxRouterAndPrefixEvents),
+				rememberPrefix: true,
+				autoGenAddrC:   make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents),
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: true,
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              true,
+					DiscoverDefaultRouters: true,
+					DiscoverOnLinkPrefixes: true,
+					AutoGenGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
 
-		return false, ndpPrefixEvent{}
-	}
+			expectRouterEvent := func() (bool, ndpRouterEvent) {
+				select {
+				case e := <-ndpDisp.routerC:
+					return true, e
+				default:
+				}
 
-	expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			return true, e
-		default:
-		}
+				return false, ndpRouterEvent{}
+			}
 
-		return false, ndpAutoGenAddrEvent{}
-	}
+			expectPrefixEvent := func() (bool, ndpPrefixEvent) {
+				select {
+				case e := <-ndpDisp.prefixC:
+					return true, e
+				default:
+				}
 
-	e1 := channel.New(0, 1280, linkAddr1)
-	if err := s.CreateNIC(nicID1, e1); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
-	}
-	// We have other tests that make sure we receive the *correct* events
-	// on normal discovery of routers/prefixes, and auto-generated
-	// addresses. Here we just make sure we get an event and let other tests
-	// handle the correctness check.
-	expectAutoGenAddrEvent()
+				return false, ndpPrefixEvent{}
+			}
 
-	e2 := channel.New(0, 1280, linkAddr2)
-	if err := s.CreateNIC(nicID2, e2); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
-	}
-	expectAutoGenAddrEvent()
+			expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					return true, e
+				default:
+				}
 
-	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
-	// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
-	// llAddr4) to discover multiple routers and prefixes, and auto-gen
-	// multiple addresses.
+				return false, ndpAutoGenAddrEvent{}
+			}
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
-	}
+			e1 := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID1, e1); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+			}
+			// We have other tests that make sure we receive the *correct* events
+			// on normal discovery of routers/prefixes, and auto-generated
+			// addresses. Here we just make sure we get an event and let other tests
+			// handle the correctness check.
+			expectAutoGenAddrEvent()
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
-	}
+			e2 := channel.New(0, 1280, linkAddr2)
+			if err := s.CreateNIC(nicID2, e2); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+			}
+			expectAutoGenAddrEvent()
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
-	}
+			// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
+			// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
+			// llAddr4) to discover multiple routers and prefixes, and auto-gen
+			// multiple addresses.
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
-	}
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
+			}
 
-	// We should have the auto-generated addresses added.
-	nicinfo := s.NICInfo()
-	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
-	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
-	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic1Addrs, e1Addr1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic1Addrs, e1Addr2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, e2Addr1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, e2Addr2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
-	}
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
+			}
 
-	// We can't proceed any further if we already failed the test (missing
-	// some discovery/auto-generated address events or addresses).
-	if t.Failed() {
-		t.FailNow()
-	}
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
+			}
 
-	s.SetForwarding(true)
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
+			}
 
-	// Collect invalidation events after becoming a router
-	gotRouterEvents := make(map[ndpRouterEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectRouterEvent()
-		if !ok {
-			t.Errorf("expected %d router events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotRouterEvents[e]++
-	}
-	gotPrefixEvents := make(map[ndpPrefixEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectPrefixEvent()
-		if !ok {
-			t.Errorf("expected %d prefix events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotPrefixEvents[e]++
-	}
-	gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectAutoGenAddrEvent()
-		if !ok {
-			t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotAutoGenAddrEvents[e]++
-	}
+			// We should have the auto-generated addresses added.
+			nicinfo := s.NICInfo()
+			nic1Addrs := nicinfo[nicID1].ProtocolAddresses
+			nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+			if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+			}
 
-	// No need to proceed any further if we already failed the test (missing
-	// some invalidation events).
-	if t.Failed() {
-		t.FailNow()
-	}
+			// We can't proceed any further if we already failed the test (missing
+			// some discovery/auto-generated address events or addresses).
+			if t.Failed() {
+				t.FailNow()
+			}
 
-	expectedRouterEvents := map[ndpRouterEvent]int{
-		{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
-		{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
-	}
-	if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
-		t.Errorf("router events mismatch (-want +got):\n%s", diff)
-	}
-	expectedPrefixEvents := map[ndpPrefixEvent]int{
-		{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
-		{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
-		{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
-		{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
-	}
-	if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
-		t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
-	}
-	expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
-		{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
-		{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
-		{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
-		{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
-	}
-	if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
-		t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
-	}
+			test.cleanupFn(t, s)
 
-	// Make sure the auto-generated addresses got removed.
-	nicinfo = s.NICInfo()
-	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
-	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
-	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
-	}
-	if containsV6Addr(nic1Addrs, e1Addr1) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
-	}
-	if containsV6Addr(nic1Addrs, e1Addr2) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
-	}
-	if containsV6Addr(nic2Addrs, e2Addr1) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
-	}
-	if containsV6Addr(nic2Addrs, e2Addr2) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
-	}
+			// Collect invalidation events after having NDP state cleaned up.
+			gotRouterEvents := make(map[ndpRouterEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectRouterEvent()
+				if !ok {
+					t.Errorf("expected %d router events after becoming a router; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotRouterEvents[e]++
+			}
+			gotPrefixEvents := make(map[ndpPrefixEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectPrefixEvent()
+				if !ok {
+					t.Errorf("expected %d prefix events after becoming a router; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotPrefixEvents[e]++
+			}
+			gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
+			for i := 0; i < test.maxAutoGenAddrEvents; i++ {
+				ok, e := expectAutoGenAddrEvent()
+				if !ok {
+					t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", test.maxAutoGenAddrEvents, i)
+					break
+				}
+				gotAutoGenAddrEvents[e]++
+			}
 
-	// Should not get any more events (invalidation timers should have been
-	// cancelled when we transitioned into a router).
-	time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
-	select {
-	case <-ndpDisp.routerC:
-		t.Error("unexpected router event")
-	default:
-	}
-	select {
-	case <-ndpDisp.prefixC:
-		t.Error("unexpected prefix event")
-	default:
-	}
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Error("unexpected auto-generated address event")
-	default:
+			// No need to proceed any further if we already failed the test (missing
+			// some invalidation events).
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			expectedRouterEvents := map[ndpRouterEvent]int{
+				{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
+				t.Errorf("router events mismatch (-want +got):\n%s", diff)
+			}
+			expectedPrefixEvents := map[ndpPrefixEvent]int{
+				{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
+				t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
+			}
+			expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
+				{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
+			}
+
+			if !test.keepAutoGenLinkLocal {
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID1, addr: llAddrWithPrefix1, eventType: invalidatedAddr}] = 1
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID2, addr: llAddrWithPrefix2, eventType: invalidatedAddr}] = 1
+			}
+
+			if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
+				t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
+			}
+
+			// Make sure the auto-generated addresses got removed.
+			nicinfo = s.NICInfo()
+			nic1Addrs = nicinfo[nicID1].ProtocolAddresses
+			nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+			if containsV6Addr(nic1Addrs, llAddrWithPrefix1) != test.keepAutoGenLinkLocal {
+				if test.keepAutoGenLinkLocal {
+					t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+				} else {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+				}
+			}
+			if containsV6Addr(nic1Addrs, e1Addr1) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+			}
+			if containsV6Addr(nic1Addrs, e1Addr2) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+			}
+			if containsV6Addr(nic2Addrs, llAddrWithPrefix2) != test.keepAutoGenLinkLocal {
+				if test.keepAutoGenLinkLocal {
+					t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+				} else {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+				}
+			}
+			if containsV6Addr(nic2Addrs, e2Addr1) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+			}
+			if containsV6Addr(nic2Addrs, e2Addr2) {
+				t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+			}
+
+			// Should not get any more events (invalidation timers should have been
+			// cancelled when the NDP state was cleaned up).
+			time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
+			select {
+			case <-ndpDisp.routerC:
+				t.Error("unexpected router event")
+			default:
+			}
+			select {
+			case <-ndpDisp.prefixC:
+				t.Error("unexpected prefix event")
+			default:
+			}
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Error("unexpected auto-generated address event")
+			default:
+			}
+		})
 	}
 }
 
@@ -3406,77 +3483,130 @@ func TestRouterSolicitation(t *testing.T) {
 	})
 }
 
-// TestStopStartSolicitingRouters tests that when forwarding is enabled or
-// disabled, router solicitations are stopped or started, respecitively.
 func TestStopStartSolicitingRouters(t *testing.T) {
 	t.Parallel()
 
+	const nicID = 1
 	const interval = 500 * time.Millisecond
 	const delay = time.Second
 	const maxRtrSolicitations = 3
-	e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
-	waitForPkt := func(timeout time.Duration) {
-		t.Helper()
-		ctx, _ := context.WithTimeout(context.Background(), timeout)
-		p, ok := e.ReadContext(ctx)
-		if !ok {
-			t.Fatal("timed out waiting for packet")
-			return
-		}
 
-		if p.Proto != header.IPv6ProtocolNumber {
-			t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
-		}
-		checker.IPv6(t, p.Pkt.Header.View(),
-			checker.SrcAddr(header.IPv6Any),
-			checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
-			checker.TTL(header.NDPHopLimit),
-			checker.NDPRS())
-	}
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			MaxRtrSolicitations:     maxRtrSolicitations,
-			RtrSolicitationInterval: interval,
-			MaxRtrSolicitationDelay: delay,
+	tests := []struct {
+		name    string
+		startFn func(t *testing.T, s *stack.Stack)
+		stopFn  func(t *testing.T, s *stack.Stack)
+	}{
+		// Tests that when forwarding is enabled or disabled, router solicitations
+		// are stopped or started, respectively.
+		{
+			name: "Forwarding enabled and disabled",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(false)
+			},
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
 		},
-	})
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(1) = %s", err)
-	}
 
-	// Enable forwarding which should stop router solicitations.
-	s.SetForwarding(true)
-	ctx, _ := context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		// A single RS may have been sent before forwarding was enabled.
-		ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
-		if _, ok = e.ReadContext(ctx); ok {
-			t.Fatal("Should not have sent more than one RS message")
-		}
-	}
+		// Tests that when a NIC is enabled or disabled, router solicitations
+		// are started or stopped, respectively.
+		{
+			name: "NIC disabled and enabled",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-	// Enabling forwarding again should do nothing.
-	s.SetForwarding(true)
-	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got a packet after becoming a router")
-	}
+				if err := s.EnableNIC(nicID); err != nil {
+					t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+				}
+			},
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-	// Disable forwarding which should start router solicitations.
-	s.SetForwarding(false)
-	waitForPkt(delay + defaultAsyncEventTimeout)
-	waitForPkt(interval + defaultAsyncEventTimeout)
-	waitForPkt(interval + defaultAsyncEventTimeout)
-	ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 	}
 
-	// Disabling forwarding again should do nothing.
-	s.SetForwarding(false)
-	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got a packet after becoming a router")
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
+			waitForPkt := func(timeout time.Duration) {
+				t.Helper()
+
+				ctx, cancel := context.WithTimeout(context.Background(), timeout)
+				defer cancel()
+				p, ok := e.ReadContext(ctx)
+				if !ok {
+					t.Fatal("timed out waiting for packet")
+					return
+				}
+
+				if p.Proto != header.IPv6ProtocolNumber {
+					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+				}
+				checker.IPv6(t, p.Pkt.Header.View(),
+					checker.SrcAddr(header.IPv6Any),
+					checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPRS())
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					MaxRtrSolicitations:     maxRtrSolicitations,
+					RtrSolicitationInterval: interval,
+					MaxRtrSolicitationDelay: delay,
+				},
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			// Stop soliciting routers.
+			test.stopFn(t, s)
+			ctx, cancel := context.WithTimeout(context.Background(), delay+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				// A single RS may have been sent before forwarding was enabled.
+				ctx, cancel := context.WithTimeout(context.Background(), interval+defaultTimeout)
+				defer cancel()
+				if _, ok = e.ReadContext(ctx); ok {
+					t.Fatal("should not have sent more than one RS message")
+				}
+			}
+
+			// Stopping router solicitations after it has already been stopped should
+			// do nothing.
+			test.stopFn(t, s)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after router solicitation has been stopepd")
+			}
+
+			// Start soliciting routers.
+			test.startFn(t, s)
+			waitForPkt(delay + defaultAsyncEventTimeout)
+			waitForPkt(interval + defaultAsyncEventTimeout)
+			waitForPkt(interval + defaultAsyncEventTimeout)
+			ctx, cancel = context.WithTimeout(context.Background(), interval+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+			}
+
+			// Starting router solicitations after it has already completed should do
+			// nothing.
+			test.startFn(t, s)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after finishing router solicitations")
+			}
+		})
 	}
 }
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index ca3a7a07e..b2be18e47 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -27,6 +27,14 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+var ipv4BroadcastAddr = tcpip.ProtocolAddress{
+	Protocol: header.IPv4ProtocolNumber,
+	AddressWithPrefix: tcpip.AddressWithPrefix{
+		Address:   header.IPv4Broadcast,
+		PrefixLen: 8 * header.IPv4AddressSize,
+	},
+}
+
 // NIC represents a "network interface card" to which the networking stack is
 // attached.
 type NIC struct {
@@ -36,7 +44,8 @@ type NIC struct {
 	linkEP  LinkEndpoint
 	context NICContext
 
-	stats NICStats
+	stats  NICStats
+	attach sync.Once
 
 	mu struct {
 		sync.RWMutex
@@ -135,7 +144,69 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	return nic
 }
 
-// enable enables the NIC. enable will attach the link to its LinkEndpoint and
+// enabled returns true if n is enabled.
+func (n *NIC) enabled() bool {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	return enabled
+}
+
+// disable disables n.
+//
+// It undoes the work done by enable.
+func (n *NIC) disable() *tcpip.Error {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	if !enabled {
+		return nil
+	}
+
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	if !n.mu.enabled {
+		return nil
+	}
+
+	// TODO(b/147015577): Should Routes that are currently bound to n be
+	// invalidated? Currently, Routes will continue to work when a NIC is enabled
+	// again, and applications may not know that the underlying NIC was ever
+	// disabled.
+
+	if _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]; ok {
+		n.mu.ndp.stopSolicitingRouters()
+		n.mu.ndp.cleanupState(false /* hostOnly */)
+
+		// Stop DAD for all the unicast IPv6 endpoints that are in the
+		// permanentTentative state.
+		for _, r := range n.mu.endpoints {
+			if addr := r.ep.ID().LocalAddress; r.getKind() == permanentTentative && header.IsV6UnicastAddress(addr) {
+				n.mu.ndp.stopDuplicateAddressDetection(addr)
+			}
+		}
+
+		// The NIC may have already left the multicast group.
+		if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
+		// The address may have already been removed.
+		if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	// TODO(b/147015577): Should n detach from its LinkEndpoint?
+
+	n.mu.enabled = false
+	return nil
+}
+
+// enable enables n. enable will attach the nic to its LinkEndpoint and
 // join the IPv6 All-Nodes Multicast address (ff02::1).
 func (n *NIC) enable() *tcpip.Error {
 	n.mu.RLock()
@@ -158,10 +229,7 @@ func (n *NIC) enable() *tcpip.Error {
 
 	// Create an endpoint to receive broadcast packets on this interface.
 	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
-		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
-			Protocol:          header.IPv4ProtocolNumber,
-			AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Broadcast, 8 * header.IPv4AddressSize},
-		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
+		if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -183,6 +251,14 @@ func (n *NIC) enable() *tcpip.Error {
 		return nil
 	}
 
+	// Join the All-Nodes multicast group before starting DAD as responses to DAD
+	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
+	// source address of the NDP NS is the unspecified address, as per RFC 4861
+	// section 7.2.4.
+	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
+		return err
+	}
+
 	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
 	// state.
 	//
@@ -200,10 +276,6 @@ func (n *NIC) enable() *tcpip.Error {
 		}
 	}
 
-	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
-		return err
-	}
-
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
 	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
 		// The valid and preferred lifetime is infinite for the auto-generated
@@ -234,7 +306,7 @@ func (n *NIC) becomeIPv6Router() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	n.mu.ndp.cleanupHostOnlyState()
+	n.mu.ndp.cleanupState(true /* hostOnly */)
 	n.mu.ndp.stopSolicitingRouters()
 }
 
@@ -252,7 +324,9 @@ func (n *NIC) becomeIPv6Host() {
 // attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
 // to start delivering packets.
 func (n *NIC) attachLinkEndpoint() {
-	n.linkEP.Attach(n)
+	n.attach.Do(func() {
+		n.linkEP.Attach(n)
+	})
 }
 
 // setPromiscuousMode enables or disables promiscuous mode.
@@ -712,6 +786,7 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 		case permanentExpired, temporary:
 			continue
 		}
+
 		addrs = append(addrs, tcpip.ProtocolAddress{
 			Protocol: ref.protocol,
 			AddressWithPrefix: tcpip.AddressWithPrefix{
@@ -1009,6 +1084,15 @@ func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
 	return nil
 }
 
+// isInGroup returns true if n has joined the multicast group addr.
+func (n *NIC) isInGroup(addr tcpip.Address) bool {
+	n.mu.RLock()
+	joins := n.mu.mcastJoins[NetworkEndpointID{addr}]
+	n.mu.RUnlock()
+
+	return joins != 0
+}
+
 func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
@@ -1411,7 +1495,7 @@ func (r *referencedNetworkEndpoint) isValidForOutgoing() bool {
 //
 // r's NIC must be read locked.
 func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool {
-	return r.getKind() != permanentExpired || r.nic.mu.spoofing
+	return r.nic.mu.enabled && (r.getKind() != permanentExpired || r.nic.mu.spoofing)
 }
 
 // decRef decrements the ref count and cleans up the endpoint once it reaches
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6eac16e16..fabc976a7 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -921,23 +921,38 @@ func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[id]
-	if nic == nil {
+	nic, ok := s.nics[id]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
 	return nic.enable()
 }
 
+// DisableNIC disables the given NIC.
+func (s *Stack) DisableNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.disable()
+}
+
 // CheckNIC checks if a NIC is usable.
 func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	s.mu.RLock()
+	defer s.mu.RUnlock()
+
 	nic, ok := s.nics[id]
-	s.mu.RUnlock()
-	if ok {
-		return nic.linkEP.IsAttached()
+	if !ok {
+		return false
 	}
-	return false
+
+	return nic.enabled()
 }
 
 // NICAddressRanges returns a map of NICIDs to their associated subnets.
@@ -989,7 +1004,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 	for id, nic := range s.nics {
 		flags := NICStateFlags{
 			Up:          true, // Netstack interfaces are always up.
-			Running:     nic.linkEP.IsAttached(),
+			Running:     nic.enabled(),
 			Promiscuous: nic.isPromiscuousMode(),
 			Loopback:    nic.isLoopback(),
 		}
@@ -1151,7 +1166,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
 	needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
 	if id != 0 && !needRoute {
-		if nic, ok := s.nics[id]; ok {
+		if nic, ok := s.nics[id]; ok && nic.enabled() {
 			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
 			}
@@ -1161,7 +1176,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) {
 				continue
 			}
-			if nic, ok := s.nics[route.NIC]; ok {
+			if nic, ok := s.nics[route.NIC]; ok && nic.enabled() {
 				if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 					if len(remoteAddr) == 0 {
 						// If no remote address was provided, then the route
@@ -1614,6 +1629,18 @@ func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NIC
 	return tcpip.ErrUnknownNICID
 }
 
+// IsInGroup returns true if the NIC with ID nicID has joined the multicast
+// group multicastAddr.
+func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic, ok := s.nics[nicID]; ok {
+		return nic.isInGroup(multicastAddr), nil
+	}
+	return false, tcpip.ErrUnknownNICID
+}
+
 // IPTables returns the stack's iptables.
 func (s *Stack) IPTables() iptables.IPTables {
 	s.tablesMu.RLock()
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 7ba604442..eb6f7d1fc 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
@@ -509,6 +510,257 @@ func testNoRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr
 	}
 }
 
+func TestDisableUnknownNIC(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	if err := s.DisableNIC(1); err != tcpip.ErrUnknownNICID {
+		t.Fatalf("got s.DisableNIC(1) = %v, want = %s", err, tcpip.ErrUnknownNICID)
+	}
+}
+
+func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	e := loopback.New()
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	checkNIC := func(enabled bool) {
+		t.Helper()
+
+		allNICInfo := s.NICInfo()
+		nicInfo, ok := allNICInfo[nicID]
+		if !ok {
+			t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo)
+		} else if nicInfo.Flags.Running != enabled {
+			t.Errorf("got nicInfo.Flags.Running = %t, want = %t", nicInfo.Flags.Running, enabled)
+		}
+
+		if got := s.CheckNIC(nicID); got != enabled {
+			t.Errorf("got s.CheckNIC(%d) = %t, want = %t", nicID, got, enabled)
+		}
+	}
+
+	// NIC should initially report itself as disabled.
+	checkNIC(false)
+
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(true)
+
+	// If the NIC is not reporting a correct enabled status, we cannot trust the
+	// next check so end the test here.
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(false)
+}
+
+func TestRoutesWithDisabledNIC(t *testing.T) {
+	const unspecifiedNIC = 0
+	const nicID1 = 1
+	const nicID2 = 2
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep1 := channel.New(0, defaultMTU, "")
+	if err := s.CreateNIC(nicID1, ep1); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+	}
+
+	addr1 := tcpip.Address("\x01")
+	if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
+	}
+
+	ep2 := channel.New(0, defaultMTU, "")
+	if err := s.CreateNIC(nicID2, ep2); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+	}
+
+	addr2 := tcpip.Address("\x02")
+	if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+	}
+
+	// Set a route table that sends all packets with odd destination
+	// addresses through the first NIC, and all even destination address
+	// through the second one.
+	{
+		subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{
+			{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
+			{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
+		})
+	}
+
+	// Test routes to odd address.
+	testRoute(t, s, unspecifiedNIC, "", "\x05", addr1)
+	testRoute(t, s, unspecifiedNIC, addr1, "\x05", addr1)
+	testRoute(t, s, nicID1, addr1, "\x05", addr1)
+
+	// Test routes to even address.
+	testRoute(t, s, unspecifiedNIC, "", "\x06", addr2)
+	testRoute(t, s, unspecifiedNIC, addr2, "\x06", addr2)
+	testRoute(t, s, nicID2, addr2, "\x06", addr2)
+
+	// Disabling NIC1 should result in no routes to odd addresses. Routes to even
+	// addresses should continue to be available as NIC2 is still enabled.
+	if err := s.DisableNIC(nicID1); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+	}
+	nic1Dst := tcpip.Address("\x05")
+	testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+	testNoRoute(t, s, nicID1, addr1, nic1Dst)
+	nic2Dst := tcpip.Address("\x06")
+	testRoute(t, s, unspecifiedNIC, "", nic2Dst, addr2)
+	testRoute(t, s, unspecifiedNIC, addr2, nic2Dst, addr2)
+	testRoute(t, s, nicID2, addr2, nic2Dst, addr2)
+
+	// Disabling NIC2 should result in no routes to even addresses. No route
+	// should be available to any address as routes to odd addresses were made
+	// unavailable by disabling NIC1 above.
+	if err := s.DisableNIC(nicID2); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+	}
+	testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+	testNoRoute(t, s, nicID1, addr1, nic1Dst)
+	testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+	testNoRoute(t, s, nicID2, addr2, nic2Dst)
+
+	// Enabling NIC1 should make routes to odd addresses available again. Routes
+	// to even addresses should continue to be unavailable as NIC2 is still
+	// disabled.
+	if err := s.EnableNIC(nicID1); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID1, err)
+	}
+	testRoute(t, s, unspecifiedNIC, "", nic1Dst, addr1)
+	testRoute(t, s, unspecifiedNIC, addr1, nic1Dst, addr1)
+	testRoute(t, s, nicID1, addr1, nic1Dst, addr1)
+	testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+	testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+	testNoRoute(t, s, nicID2, addr2, nic2Dst)
+}
+
+func TestRouteWritePacketWithDisabledNIC(t *testing.T) {
+	const unspecifiedNIC = 0
+	const nicID1 = 1
+	const nicID2 = 2
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep1 := channel.New(1, defaultMTU, "")
+	if err := s.CreateNIC(nicID1, ep1); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+	}
+
+	addr1 := tcpip.Address("\x01")
+	if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
+	}
+
+	ep2 := channel.New(1, defaultMTU, "")
+	if err := s.CreateNIC(nicID2, ep2); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+	}
+
+	addr2 := tcpip.Address("\x02")
+	if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+	}
+
+	// Set a route table that sends all packets with odd destination
+	// addresses through the first NIC, and all even destination address
+	// through the second one.
+	{
+		subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{
+			{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
+			{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
+		})
+	}
+
+	nic1Dst := tcpip.Address("\x05")
+	r1, err := s.FindRoute(nicID1, addr1, nic1Dst, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID1, addr1, nic1Dst, fakeNetNumber, err)
+	}
+	defer r1.Release()
+
+	nic2Dst := tcpip.Address("\x06")
+	r2, err := s.FindRoute(nicID2, addr2, nic2Dst, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID2, addr2, nic2Dst, fakeNetNumber, err)
+	}
+	defer r2.Release()
+
+	// If we failed to get routes r1 or r2, we cannot proceed with the test.
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	buf := buffer.View([]byte{1})
+	testSend(t, r1, ep1, buf)
+	testSend(t, r2, ep2, buf)
+
+	// Writes with Routes that use the disabled NIC1 should fail.
+	if err := s.DisableNIC(nicID1); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+	}
+	testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+	testSend(t, r2, ep2, buf)
+
+	// Writes with Routes that use the disabled NIC2 should fail.
+	if err := s.DisableNIC(nicID2); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+	}
+	testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+	testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+
+	// Writes with Routes that use the re-enabled NIC1 should succeed.
+	// TODO(b/147015577): Should we instead completely invalidate all Routes that
+	// were bound to a disabled NIC at some point?
+	if err := s.EnableNIC(nicID1); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID1, err)
+	}
+	testSend(t, r1, ep1, buf)
+	testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+}
+
 func TestRoutes(t *testing.T) {
 	// Create a stack with the fake network protocol, two nics, and two
 	// addresses per nic, the first nic has odd address, the second one has
@@ -2173,13 +2425,29 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 
 			e := channel.New(0, 1280, test.linkAddr)
 			s := stack.New(opts)
-			nicOpts := stack.NICOptions{Name: test.nicName}
+			nicOpts := stack.NICOptions{Name: test.nicName, Disabled: true}
 			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
 				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
 			}
 
-			var expectedMainAddr tcpip.AddressWithPrefix
+			// A new disabled NIC should not have any address, even if auto generation
+			// was enabled.
+			allStackAddrs := s.AllAddresses()
+			allNICAddrs, ok := allStackAddrs[nicID]
+			if !ok {
+				t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+			}
+			if l := len(allNICAddrs); l != 0 {
+				t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+			}
 
+			// Enabling the NIC should attempt auto-generation of a link-local
+			// address.
+			if err := s.EnableNIC(nicID); err != nil {
+				t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+			}
+
+			var expectedMainAddr tcpip.AddressWithPrefix
 			if test.shouldGen {
 				expectedMainAddr = tcpip.AddressWithPrefix{
 					Address:   test.expectedAddr,
@@ -2609,6 +2877,111 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 	}
 }
 
+func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	e := loopback.New()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	allStackAddrs := s.AllAddresses()
+	allNICAddrs, ok := allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+
+	// Enabling the NIC should add the IPv4 broadcast address.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 1 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 1", l)
+	}
+	want := tcpip.ProtocolAddress{
+		Protocol: header.IPv4ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   header.IPv4Broadcast,
+			PrefixLen: 32,
+		},
+	}
+	if allNICAddrs[0] != want {
+		t.Fatalf("got allNICAddrs[0] = %+v, want = %+v", allNICAddrs[0], want)
+	}
+
+	// Disabling the NIC should remove the IPv4 broadcast address.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+}
+
+func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	e := loopback.New()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	// Should not be in the IPv6 all-nodes multicast group yet because the NIC has
+	// not been enabled yet.
+	isInGroup, err := s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be joined when the NIC is enabled.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if !isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be left when the NIC is disabled.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+}
+
 // TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC
 // was disabled have DAD performed on them when the NIC is enabled.
 func TestDoDADWhenNICEnabled(t *testing.T) {
-- 
cgit v1.2.3


From 4a73bae269ae9f52a962ae3b08a17ccaacf7ba80 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 15:19:40 -0800
Subject: Initial network namespace support.

TCP/IP will work with netstack networking. hostinet doesn't work, and sockets
will have the same behavior as it is now.

Before the userspace is able to create device, the default loopback device can
be used to test.

/proc/net and /sys/net will still be connected to the root network stack; this
is the same behavior now.

Issue #1833

PiperOrigin-RevId: 296309389
---
 pkg/sentry/fs/proc/net.go                |   5 +-
 pkg/sentry/fs/proc/sys_net.go            |   4 +-
 pkg/sentry/fsimpl/proc/tasks_net.go      |   5 +-
 pkg/sentry/fsimpl/proc/tasks_sys.go      |   4 +-
 pkg/sentry/fsimpl/testutil/kernel.go     |   1 +
 pkg/sentry/inet/BUILD                    |   1 +
 pkg/sentry/inet/namespace.go             |  99 +++++++++++++++++++++++++
 pkg/sentry/kernel/kernel.go              |  26 ++++---
 pkg/sentry/kernel/task.go                |   9 +--
 pkg/sentry/kernel/task_clone.go          |  16 ++--
 pkg/sentry/kernel/task_net.go            |  19 +++--
 pkg/sentry/kernel/task_start.go          |   8 +-
 pkg/tcpip/time_unsafe.go                 |   2 +
 runsc/boot/BUILD                         |   2 +-
 runsc/boot/controller.go                 |  11 +--
 runsc/boot/loader.go                     | 121 +++++++++++++++++++++----------
 runsc/boot/network.go                    |  27 +++++++
 runsc/boot/pprof.go                      |  18 -----
 runsc/boot/pprof/BUILD                   |  11 +++
 runsc/boot/pprof/pprof.go                |  20 +++++
 runsc/sandbox/network.go                 |  25 +------
 test/syscalls/BUILD                      |   2 +
 test/syscalls/linux/BUILD                |  17 +++++
 test/syscalls/linux/network_namespace.cc | 121 +++++++++++++++++++++++++++++++
 24 files changed, 451 insertions(+), 123 deletions(-)
 create mode 100644 pkg/sentry/inet/namespace.go
 delete mode 100644 runsc/boot/pprof.go
 create mode 100644 runsc/boot/pprof/BUILD
 create mode 100644 runsc/boot/pprof/pprof.go
 create mode 100644 test/syscalls/linux/network_namespace.cc

(limited to 'pkg')

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 6f2775344..95d5817ff 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -43,7 +43,10 @@ import (
 // newNet creates a new proc net entry.
 func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
 			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 0772d4ae4..d4c4b533d 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
 
 func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"ipv4": p.newSysNetIPv4Dir(ctx, msrc, s),
 			"core": p.newSysNetCore(ctx, msrc, s),
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
index 608fec017..d4e1812d8 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -39,7 +39,10 @@ import (
 
 func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		const (
 			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
 			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index c7ce74883..3d5dc463c 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -50,7 +50,9 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k
 func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
 
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		contents = map[string]*kernfs.Dentry{
 			"ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
 				"tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}),
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index d0be32e72..488478e29 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -128,6 +128,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
 		ThreadGroup:             tc,
 		TaskContext:             &kernel.TaskContext{Name: name},
 		Credentials:             auth.CredentialsFromContext(ctx),
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
 		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
 		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 334432abf..07bf39fed 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -10,6 +10,7 @@ go_library(
     srcs = [
         "context.go",
         "inet.go",
+        "namespace.go",
         "test_stack.go",
     ],
     deps = [
diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go
new file mode 100644
index 000000000..c16667e7f
--- /dev/null
+++ b/pkg/sentry/inet/namespace.go
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package inet
+
+// Namespace represents a network namespace. See network_namespaces(7).
+//
+// +stateify savable
+type Namespace struct {
+	// stack is the network stack implementation of this network namespace.
+	stack Stack `state:"nosave"`
+
+	// creator allows kernel to create new network stack for network namespaces.
+	// If nil, no networking will function if network is namespaced.
+	creator NetworkStackCreator
+
+	// isRoot indicates whether this is the root network namespace.
+	isRoot bool
+}
+
+// NewRootNamespace creates the root network namespace, with creator
+// allowing new network namespaces to be created. If creator is nil, no
+// networking will function if the network is namespaced.
+func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace {
+	return &Namespace{
+		stack:   stack,
+		creator: creator,
+		isRoot:  true,
+	}
+}
+
+// NewNamespace creates a new network namespace from the root.
+func NewNamespace(root *Namespace) *Namespace {
+	n := &Namespace{
+		creator: root.creator,
+	}
+	n.init()
+	return n
+}
+
+// Stack returns the network stack of n. Stack may return nil if no network
+// stack is configured.
+func (n *Namespace) Stack() Stack {
+	return n.stack
+}
+
+// IsRoot returns whether n is the root network namespace.
+func (n *Namespace) IsRoot() bool {
+	return n.isRoot
+}
+
+// RestoreRootStack restores the root network namespace with stack. This should
+// only be called when restoring kernel.
+func (n *Namespace) RestoreRootStack(stack Stack) {
+	if !n.isRoot {
+		panic("RestoreRootStack can only be called on root network namespace")
+	}
+	if n.stack != nil {
+		panic("RestoreRootStack called after a stack has already been set")
+	}
+	n.stack = stack
+}
+
+func (n *Namespace) init() {
+	// Root network namespace will have stack assigned later.
+	if n.isRoot {
+		return
+	}
+	if n.creator != nil {
+		var err error
+		n.stack, err = n.creator.CreateStack()
+		if err != nil {
+			panic(err)
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (n *Namespace) afterLoad() {
+	n.init()
+}
+
+// NetworkStackCreator allows new instances of a network stack to be created. It
+// is used by the kernel to create new network namespaces when requested.
+type NetworkStackCreator interface {
+	// CreateStack creates a new network stack for a network namespace.
+	CreateStack() (Stack, error)
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 7da0368f1..c62fd6eb1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -111,7 +111,7 @@ type Kernel struct {
 	timekeeper                  *Timekeeper
 	tasks                       *TaskSet
 	rootUserNamespace           *auth.UserNamespace
-	networkStack                inet.Stack `state:"nosave"`
+	rootNetworkNamespace        *inet.Namespace
 	applicationCores            uint
 	useHostCores                bool
 	extraAuxv                   []arch.AuxEntry
@@ -260,8 +260,9 @@ type InitKernelArgs struct {
 	// RootUserNamespace is the root user namespace.
 	RootUserNamespace *auth.UserNamespace
 
-	// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
-	NetworkStack inet.Stack
+	// RootNetworkNamespace is the root network namespace. If nil, no networking
+	// will be available.
+	RootNetworkNamespace *inet.Namespace
 
 	// ApplicationCores is the number of logical CPUs visible to sandboxed
 	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
@@ -320,7 +321,10 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.rootUTSNamespace = args.RootUTSNamespace
 	k.rootIPCNamespace = args.RootIPCNamespace
 	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
-	k.networkStack = args.NetworkStack
+	k.rootNetworkNamespace = args.RootNetworkNamespace
+	if k.rootNetworkNamespace == nil {
+		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
+	}
 	k.applicationCores = args.ApplicationCores
 	if args.UseHostCores {
 		k.useHostCores = true
@@ -543,8 +547,6 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
-	k.networkStack = net
-
 	initAppCores := k.applicationCores
 
 	// Load the pre-saved CPUID FeatureSet.
@@ -575,6 +577,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	log.Infof("Kernel load stats: %s", &stats)
 	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
 
+	// rootNetworkNamespace should be populated after loading the state file.
+	// Restore the root network stack.
+	k.rootNetworkNamespace.RestoreRootStack(net)
+
 	// Load the memory file's state.
 	memoryStart := time.Now()
 	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
@@ -905,6 +911,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
@@ -1255,10 +1262,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
 	return k.rootAbstractSocketNamespace
 }
 
-// NetworkStack returns the network stack. NetworkStack may return nil if no
-// network stack is available.
-func (k *Kernel) NetworkStack() inet.Stack {
-	return k.networkStack
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
 }
 
 // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index a3443ff21..e37e23231 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -486,13 +486,10 @@ type Task struct {
 	numaPolicy   int32
 	numaNodeMask uint64
 
-	// If netns is true, the task is in a non-root network namespace. Network
-	// namespaces aren't currently implemented in full; being in a network
-	// namespace simply prevents the task from observing any network devices
-	// (including loopback) or using abstract socket addresses (see unix(7)).
+	// netns is the task's network namespace. netns is never nil.
 	//
-	// netns is protected by mu. netns is owned by the task goroutine.
-	netns bool
+	// netns is protected by mu.
+	netns *inet.Namespace
 
 	// If rseqPreempted is true, before the next call to p.Switch(),
 	// interrupt rseq critical regions as defined by rseqAddr and
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index ba74b4c1c..78866f280 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -54,8 +55,7 @@ type SharingOptions struct {
 	NewUserNamespace bool
 
 	// If NewNetworkNamespace is true, the task should have an independent
-	// network namespace. (Note that network namespaces are not really
-	// implemented; see comment on Task.netns for details.)
+	// network namespace.
 	NewNetworkNamespace bool
 
 	// If NewFiles is true, the task should use an independent file descriptor
@@ -199,6 +199,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	netns := t.NetworkNamespace()
+	if opts.NewNetworkNamespace {
+		netns = inet.NewNamespace(netns)
+	}
+
 	// TODO(b/63601033): Implement CLONE_NEWNS.
 	mntnsVFS2 := t.mountNamespaceVFS2
 	if mntnsVFS2 != nil {
@@ -268,7 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		FDTable:                 fdTable,
 		Credentials:             creds,
 		Niceness:                t.Niceness(),
-		NetworkNamespaced:       t.netns,
+		NetworkNamespace:        netns,
 		AllowedCPUMask:          t.CPUMask(),
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
@@ -283,9 +288,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else {
 		cfg.InheritParent = t
 	}
-	if opts.NewNetworkNamespace {
-		cfg.NetworkNamespaced = true
-	}
 	nt, err := t.tg.pidns.owner.NewTask(cfg)
 	if err != nil {
 		if opts.NewThreadGroup {
@@ -482,7 +484,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 			t.mu.Unlock()
 			return syserror.EPERM
 		}
-		t.netns = true
+		t.netns = inet.NewNamespace(t.netns)
 	}
 	if opts.NewUTSNamespace {
 		if !haveCapSysAdmin {
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
index 172a31e1d..f7711232c 100644
--- a/pkg/sentry/kernel/task_net.go
+++ b/pkg/sentry/kernel/task_net.go
@@ -22,14 +22,23 @@ import (
 func (t *Task) IsNetworkNamespaced() bool {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	return t.netns
+	return !t.netns.IsRoot()
 }
 
 // NetworkContext returns the network stack used by the task. NetworkContext
 // may return nil if no network stack is available.
+//
+// TODO(gvisor.dev/issue/1833): Migrate callers of this method to
+// NetworkNamespace().
 func (t *Task) NetworkContext() inet.Stack {
-	if t.IsNetworkNamespaced() {
-		return nil
-	}
-	return t.k.networkStack
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns.Stack()
+}
+
+// NetworkNamespace returns the network namespace observed by the task.
+func (t *Task) NetworkNamespace() *inet.Namespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns
 }
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index f9236a842..a5035bb7f 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
@@ -65,9 +66,8 @@ type TaskConfig struct {
 	// Niceness is the niceness of the new task.
 	Niceness int
 
-	// If NetworkNamespaced is true, the new task should observe a non-root
-	// network namespace.
-	NetworkNamespaced bool
+	// NetworkNamespace is the network namespace to be used for the new task.
+	NetworkNamespace *inet.Namespace
 
 	// AllowedCPUMask contains the cpus that this task can run on.
 	AllowedCPUMask sched.CPUSet
@@ -133,7 +133,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
 		ioUsage:            &usage.IO{},
 		niceness:           cfg.Niceness,
-		netns:              cfg.NetworkNamespaced,
+		netns:              cfg.NetworkNamespace,
 		utsns:              cfg.UTSNamespace,
 		ipcns:              cfg.IPCNamespace,
 		abstractSockets:    cfg.AbstractSocketNamespace,
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index 48764b978..2f98a996f 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -25,6 +25,8 @@ import (
 )
 
 // StdClock implements Clock with the time package.
+//
+// +stateify savable
 type StdClock struct{}
 
 var _ Clock = (*StdClock)(nil)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index ae4dd102a..26f68fe3d 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -19,7 +19,6 @@ go_library(
         "loader_amd64.go",
         "loader_arm64.go",
         "network.go",
-        "pprof.go",
         "strace.go",
         "user.go",
     ],
@@ -91,6 +90,7 @@ go_library(
         "//pkg/usermem",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 9c9e94864..17e774e0c 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -142,7 +143,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}
 	srv.Register(manager)
 
-	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
+	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
@@ -341,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("creating memory file: %v", err)
 	}
 	k.SetMemoryFile(mf)
-	networkStack := cm.l.k.NetworkStack()
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
 	cm.l.k = k
 
 	// Set up the restore environment.
@@ -365,9 +366,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	if cm.l.conf.ProfileEnable {
-		// initializePProf opens /proc/self/maps, so has to be
-		// called before installing seccomp filters.
-		initializePProf()
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
 	}
 
 	// Seccomp filters have to be applied before parsing the state file.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index eef43b9df..e7ca98134 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -49,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -60,6 +61,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
@@ -230,11 +232,8 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("enabling strace: %v", err)
 	}
 
-	// Create an empty network stack because the network namespace may be empty at
-	// this point. Netns is configured before Run() is called. Netstack is
-	// configured using a control uRPC message. Host network is configured inside
-	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -277,7 +276,7 @@ func New(args Args) (*Loader, error) {
 		FeatureSet:                  cpuid.HostFeatureSet(),
 		Timekeeper:                  tk,
 		RootUserNamespace:           creds.UserNamespace,
-		NetworkStack:                networkStack,
+		RootNetworkNamespace:        netns,
 		ApplicationCores:            uint(args.NumCPU),
 		Vdso:                        vdso,
 		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
@@ -466,7 +465,7 @@ func (l *Loader) run() error {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
-		stack := l.k.NetworkStack().(*hostinet.Stack)
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
 		if err := stack.Configure(); err != nil {
 			return err
 		}
@@ -485,7 +484,7 @@ func (l *Loader) run() error {
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
 		if l.conf.ProfileEnable {
-			initializePProf()
+			pprof.Initialize()
 		}
 
 		// Finally done with all configuration. Setup filters before user code
@@ -908,48 +907,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
 	switch conf.Network {
 	case NetworkHost:
-		return hostinet.NewStack(), nil
+		// No network namespacing support for hostinet yet, hence creator is nil.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
 	case NetworkNone, NetworkSandbox:
-		// NetworkNone sets up loopback using netstack.
-		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
-		s := netstack.Stack{stack.New(stack.Options{
-			NetworkProtocols:   netProtos,
-			TransportProtocols: transProtos,
-			Clock:              clock,
-			Stats:              netstack.Metrics,
-			HandleLocal:        true,
-			// Enable raw sockets for users with sufficient
-			// privileges.
-			RawFactory: raw.EndpointFactory{},
-			UniqueID:   uniqueID,
-		})}
-
-		// Enable SACK Recovery.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-			return nil, fmt.Errorf("failed to enable SACK: %v", err)
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		if err != nil {
+			return nil, err
 		}
+		creator := &sandboxNetstackCreator{
+			clock:    clock,
+			uniqueID: uniqueID,
+		}
+		return inet.NewRootNamespace(s, creator), nil
 
-		// Set default TTLs as required by socket/netstack.
-		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	default:
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
 
-		// Enable Receive Buffer Auto-Tuning.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
-		}
+}
 
-		s.FillDefaultIPTables()
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
 
-		return &s, nil
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %v", err)
+	}
 
-	default:
-		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	// Set default TTLs as required by socket/netstack.
+	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
+	s.FillDefaultIPTables()
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements kernel.NetworkStackCreator.
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	clock    tcpip.Clock
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements kernel.NetworkStackCreator.CreateStack.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
 	}
+
+	// Setup loopback.
+	n := &Network{Stack: s.(*netstack.Stack).Stack}
+	nicID := tcpip.NICID(f.uniqueID.UniqueID())
+	link := DefaultLoopbackLink
+	linkEP := loopback.New()
+	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+		return nil, err
+	}
+
+	return s, nil
 }
 
 // signal sends a signal to one or more processes in a container. If PID is 0,
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 6a8765ec8..bee6ee336 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -17,6 +17,7 @@ package boot
 import (
 	"fmt"
 	"net"
+	"strings"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -31,6 +32,32 @@ import (
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+var (
+	// DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and
+	// "::1/8" on "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
 // Network exposes methods that can be used to configure a network stack.
 type Network struct {
 	Stack *stack.Stack
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go
deleted file mode 100644
index 463362f02..000000000
--- a/runsc/boot/pprof.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-func initializePProf() {
-}
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go
new file mode 100644
index 000000000..1ded20dee
--- /dev/null
+++ b/runsc/boot/pprof/pprof.go
@@ -0,0 +1,20 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
+
+// Initialize will be called at boot for initializing custom profilers.
+func Initialize() {
+}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 99e143696..bc093fba5 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -21,7 +21,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"strings"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -75,30 +74,8 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 }
 
 func createDefaultLoopbackInterface(conn *urpc.Client) error {
-	link := boot.LoopbackLink{
-		Name: "lo",
-		Addresses: []net.IP{
-			net.IP("\x7f\x00\x00\x01"),
-			net.IPv6loopback,
-		},
-		Routes: []boot.Route{
-			{
-				Destination: net.IPNet{
-
-					IP:   net.IPv4(0x7f, 0, 0, 0),
-					Mask: net.IPv4Mask(0xff, 0, 0, 0),
-				},
-			},
-			{
-				Destination: net.IPNet{
-					IP:   net.IPv6loopback,
-					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
-				},
-			},
-		},
-	}
 	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
-		LoopbackLinks: []boot.LoopbackLink{link},
+		LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink},
 	}, nil); err != nil {
 		return fmt.Errorf("creating loopback link and routes: %v", err)
 	}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d69ac8356..d1977d4de 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -258,6 +258,8 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:munmap_test")
 
+syscall_test(test = "//test/syscalls/linux:network_namespace_test")
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:open_create_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 05a818795..aa303af84 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3639,6 +3639,23 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "network_namespace_test",
+    testonly = 1,
+    srcs = ["network_namespace.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/util:capability_util",
+        "//test/util:memory_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
 cc_binary(
     name = "semaphore_test",
     testonly = 1,
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
new file mode 100644
index 000000000..6ea48c263
--- /dev/null
+++ b/test/syscalls/linux/network_namespace.cc
@@ -0,0 +1,121 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <net/if.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using TestFunc = std::function<PosixError()>;
+using RunFunc = std::function<PosixError(TestFunc)>;
+
+struct NamespaceStrategy {
+  RunFunc run;
+
+  static NamespaceStrategy Of(RunFunc run) {
+    NamespaceStrategy s;
+    s.run = run;
+    return s;
+  }
+};
+
+PosixError RunWithUnshare(TestFunc fn) {
+  PosixError err = PosixError(-1, "function did not return a value");
+  ScopedThread t([&] {
+    if (unshare(CLONE_NEWNET) != 0) {
+      err = PosixError(errno);
+      return;
+    }
+    err = fn();
+  });
+  t.Join();
+  return err;
+}
+
+PosixError RunWithClone(TestFunc fn) {
+  struct Args {
+    absl::Notification n;
+    TestFunc fn;
+    PosixError err;
+  };
+  Args args;
+  args.fn = fn;
+  args.err = PosixError(-1, "function did not return a value");
+
+  ASSIGN_OR_RETURN_ERRNO(
+      Mapping child_stack,
+      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+  pid_t child = clone(
+      +[](void *arg) {
+        Args *args = reinterpret_cast<Args *>(arg);
+        args->err = args->fn();
+        args->n.Notify();
+        syscall(SYS_exit, 0);  // Exit manually. No return address on stack.
+        return 0;
+      },
+      reinterpret_cast<void *>(child_stack.addr() + kPageSize),
+      CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args);
+  if (child < 0) {
+    return PosixError(errno, "clone() failed");
+  }
+  args.n.WaitForNotification();
+  return args.err;
+}
+
+class NetworkNamespaceTest
+    : public ::testing::TestWithParam<NamespaceStrategy> {};
+
+TEST_P(NetworkNamespaceTest, LoopbackExists) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  EXPECT_NO_ERRNO(GetParam().run([]() {
+    // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
+    // Check loopback device exists.
+    int sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+      return PosixError(errno, "socket() failed");
+    }
+    struct ifreq ifr;
+    snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+    if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) {
+      return PosixError(errno, "ioctl() failed, lo cannot be found");
+    }
+    return NoError();
+  }));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllNetworkNamespaceTest, NetworkNamespaceTest,
+    ::testing::Values(NamespaceStrategy::Of(RunWithUnshare),
+                      NamespaceStrategy::Of(RunWithClone)));
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 97c07242c37e56f6cfdc52036d554052ba95f2ae Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 21 Feb 2020 09:53:56 -0800
Subject: Use Route.MaxHeaderLength when constructing NDP RS

Test: stack_test.TestRouterSolicitation
PiperOrigin-RevId: 296454766
---
 pkg/tcpip/stack/ndp.go      |  2 +-
 pkg/tcpip/stack/ndp_test.go | 78 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 68 insertions(+), 12 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 19bd05aa3..f651871ce 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1235,7 +1235,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 		}
 
 		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
-		hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize)
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + payloadSize)
 		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
 		pkt.SetType(header.ICMPv6RouterSolicit)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index f7b75b74e..6e9306d09 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -267,6 +267,17 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s
 	}
 }
 
+// channelLinkWithHeaderLength is a channel.Endpoint with a configurable
+// header length.
+type channelLinkWithHeaderLength struct {
+	*channel.Endpoint
+	headerLength uint16
+}
+
+func (l *channelLinkWithHeaderLength) MaxHeaderLength() uint16 {
+	return l.headerLength
+}
+
 // Check e to make sure that the event is for addr on nic with ID 1, and the
 // resolved flag set to resolved with the specified err.
 func checkDADEvent(e ndpDADEvent, nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) string {
@@ -323,21 +334,46 @@ func TestDADDisabled(t *testing.T) {
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
 // RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
+// This tests also validates the NDP NS packet that is transmitted.
 func TestDADResolve(t *testing.T) {
 	const nicID = 1
 
 	tests := []struct {
 		name                    string
+		linkHeaderLen           uint16
 		dupAddrDetectTransmits  uint8
 		retransTimer            time.Duration
 		expectedRetransmitTimer time.Duration
 	}{
-		{"1:1s:1s", 1, time.Second, time.Second},
-		{"2:1s:1s", 2, time.Second, time.Second},
-		{"1:2s:2s", 1, 2 * time.Second, 2 * time.Second},
+		{
+			name:                    "1:1s:1s",
+			dupAddrDetectTransmits:  1,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "2:1s:1s",
+			linkHeaderLen:           1,
+			dupAddrDetectTransmits:  2,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "1:2s:2s",
+			linkHeaderLen:           2,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            2 * time.Second,
+			expectedRetransmitTimer: 2 * time.Second,
+		},
 		// 0s is an invalid RetransmitTimer timer and will be fixed to
 		// the default RetransmitTimer value of 1s.
-		{"1:0s:1s", 1, 0, time.Second},
+		{
+			name:                    "1:0s:1s",
+			linkHeaderLen:           3,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            0,
+			expectedRetransmitTimer: time.Second,
+		},
 	}
 
 	for _, test := range tests {
@@ -356,10 +392,13 @@ func TestDADResolve(t *testing.T) {
 			opts.NDPConfigs.RetransmitTimer = test.retransTimer
 			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
 
-			e := channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1)
-			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			e := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1),
+				headerLength: test.linkHeaderLen,
+			}
+			e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 			s := stack.New(opts)
-			if err := s.CreateNIC(nicID, e); err != nil {
+			if err := s.CreateNIC(nicID, &e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
@@ -445,6 +484,10 @@ func TestDADResolve(t *testing.T) {
 						checker.NDPNSTargetAddress(addr1),
 						checker.NDPNSOptions(nil),
 					))
+
+				if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+					t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+				}
 			}
 		})
 	}
@@ -3336,8 +3379,11 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 func TestRouterSolicitation(t *testing.T) {
 	t.Parallel()
 
+	const nicID = 1
+
 	tests := []struct {
 		name                        string
+		linkHeaderLen               uint16
 		maxRtrSolicit               uint8
 		rtrSolicitInt               time.Duration
 		effectiveRtrSolicitInt      time.Duration
@@ -3354,6 +3400,7 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Two RS with delay",
+			linkHeaderLen:               1,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3362,6 +3409,7 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Single RS without delay",
+			linkHeaderLen:               2,
 			maxRtrSolicit:               1,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3370,6 +3418,7 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Two RS without delay and invalid zero interval",
+			linkHeaderLen:               3,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               0,
 			effectiveRtrSolicitInt:      4 * time.Second,
@@ -3407,8 +3456,11 @@ func TestRouterSolicitation(t *testing.T) {
 
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
-				e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
-				e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+				e := channelLinkWithHeaderLength{
+					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, linkAddr1),
+					headerLength: test.linkHeaderLen,
+				}
+				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 				waitForPkt := func(timeout time.Duration) {
 					t.Helper()
 					ctx, _ := context.WithTimeout(context.Background(), timeout)
@@ -3434,6 +3486,10 @@ func TestRouterSolicitation(t *testing.T) {
 						checker.TTL(header.NDPHopLimit),
 						checker.NDPRS(),
 					)
+
+					if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+						t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+					}
 				}
 				waitForNothing := func(timeout time.Duration) {
 					t.Helper()
@@ -3450,8 +3506,8 @@ func TestRouterSolicitation(t *testing.T) {
 						MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
 					},
 				})
-				if err := s.CreateNIC(1, e); err != nil {
-					t.Fatalf("CreateNIC(1) = %s", err)
+				if err := s.CreateNIC(nicID, &e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 				}
 
 				// Make sure each RS got sent at the right
-- 
cgit v1.2.3


From a155a23480abfafe096ff50f2c4aaf2c215b6c44 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 21 Feb 2020 11:20:15 -0800
Subject: Attach LinkEndpoint to NetworkDispatcher immediately

Tests: stack_test.TestAttachToLinkEndpointImmediately
PiperOrigin-RevId: 296474068
---
 pkg/tcpip/stack/nic.go        | 25 +++++++------------
 pkg/tcpip/stack/stack.go      |  5 ++--
 pkg/tcpip/stack/stack_test.go | 56 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 18 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index b2be18e47..862954ab2 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -44,8 +44,7 @@ type NIC struct {
 	linkEP  LinkEndpoint
 	context NICContext
 
-	stats  NICStats
-	attach sync.Once
+	stats NICStats
 
 	mu struct {
 		sync.RWMutex
@@ -141,6 +140,8 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 		nic.mu.packetEPs[netProto.Number()] = []PacketEndpoint{}
 	}
 
+	nic.linkEP.Attach(nic)
+
 	return nic
 }
 
@@ -200,14 +201,16 @@ func (n *NIC) disable() *tcpip.Error {
 		}
 	}
 
-	// TODO(b/147015577): Should n detach from its LinkEndpoint?
-
 	n.mu.enabled = false
 	return nil
 }
 
-// enable enables n. enable will attach the nic to its LinkEndpoint and
-// join the IPv6 All-Nodes Multicast address (ff02::1).
+// enable enables n.
+//
+// If the stack has IPv6 enabled, enable will join the IPv6 All-Nodes Multicast
+// address (ff02::1), start DAD for permanent addresses, and start soliciting
+// routers if the stack is not operating as a router. If the stack is also
+// configured to auto-generate a link-local address, one will be generated.
 func (n *NIC) enable() *tcpip.Error {
 	n.mu.RLock()
 	enabled := n.mu.enabled
@@ -225,8 +228,6 @@ func (n *NIC) enable() *tcpip.Error {
 
 	n.mu.enabled = true
 
-	n.attachLinkEndpoint()
-
 	// Create an endpoint to receive broadcast packets on this interface.
 	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
 		if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
@@ -321,14 +322,6 @@ func (n *NIC) becomeIPv6Host() {
 	n.mu.ndp.startSolicitingRouters()
 }
 
-// attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
-// to start delivering packets.
-func (n *NIC) attachLinkEndpoint() {
-	n.attach.Do(func() {
-		n.linkEP.Attach(n)
-	})
-}
-
 // setPromiscuousMode enables or disables promiscuous mode.
 func (n *NIC) setPromiscuousMode(enable bool) {
 	n.mu.Lock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index fabc976a7..f0ed76fbe 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -881,6 +881,8 @@ type NICOptions struct {
 // CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and
 // NICOptions. See the documentation on type NICOptions for details on how
 // NICs can be configured.
+//
+// LinkEndpoint.Attach will be called to bind ep with a NetworkDispatcher.
 func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *tcpip.Error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -900,7 +902,6 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 	}
 
 	n := newNIC(s, id, opts.Name, ep, opts.Context)
-
 	s.nics[id] = n
 	if !opts.Disabled {
 		return n.enable()
@@ -910,7 +911,7 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 }
 
 // CreateNIC creates a NIC with the provided id and LinkEndpoint and calls
-// `LinkEndpoint.Attach` to start delivering packets to it.
+// LinkEndpoint.Attach to bind ep with a NetworkDispatcher.
 func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index eb6f7d1fc..18016e7db 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -239,6 +239,23 @@ func fakeNetFactory() stack.NetworkProtocol {
 	return &fakeNetworkProtocol{}
 }
 
+// linkEPWithMockedAttach is a stack.LinkEndpoint that tests can use to verify
+// that LinkEndpoint.Attach was called.
+type linkEPWithMockedAttach struct {
+	stack.LinkEndpoint
+	attached bool
+}
+
+// Attach implements stack.LinkEndpoint.Attach.
+func (l *linkEPWithMockedAttach) Attach(d stack.NetworkDispatcher) {
+	l.LinkEndpoint.Attach(d)
+	l.attached = true
+}
+
+func (l *linkEPWithMockedAttach) isAttached() bool {
+	return l.attached
+}
+
 func TestNetworkReceive(t *testing.T) {
 	// Create a stack with the fake network protocol, one nic, and two
 	// addresses attached to it: 1 & 2.
@@ -510,6 +527,45 @@ func testNoRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr
 	}
 }
 
+// TestAttachToLinkEndpointImmediately tests that a LinkEndpoint is attached to
+// a NetworkDispatcher when the NIC is created.
+func TestAttachToLinkEndpointImmediately(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name    string
+		nicOpts stack.NICOptions
+	}{
+		{
+			name:    "Create enabled NIC",
+			nicOpts: stack.NICOptions{Disabled: false},
+		},
+		{
+			name:    "Create disabled NIC",
+			nicOpts: stack.NICOptions{Disabled: true},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+
+			e := linkEPWithMockedAttach{
+				LinkEndpoint: loopback.New(),
+			}
+
+			if err := s.CreateNICWithOptions(nicID, &e, test.nicOpts); err != nil {
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, test.nicOpts, err)
+			}
+			if !e.isAttached() {
+				t.Fatalf("link endpoint not attached to a network disatcher")
+			}
+		})
+	}
+}
+
 func TestDisableUnknownNIC(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-- 
cgit v1.2.3


From b8f56c79be40d9c75f4e2f279c9d821d1c1c3569 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 21 Feb 2020 15:41:56 -0800
Subject: Implement tap/tun device in vfs.

PiperOrigin-RevId: 296526279
---
 pkg/abi/linux/BUILD                              |   1 +
 pkg/abi/linux/ioctl.go                           |  26 ++
 pkg/abi/linux/ioctl_tun.go                       |  29 ++
 pkg/sentry/fs/dev/BUILD                          |   5 +
 pkg/sentry/fs/dev/dev.go                         |  10 +-
 pkg/sentry/fs/dev/net_tun.go                     | 170 +++++++++++
 pkg/syserror/syserror.go                         |   1 +
 pkg/tcpip/buffer/view.go                         |   6 +
 pkg/tcpip/link/channel/BUILD                     |   1 +
 pkg/tcpip/link/channel/channel.go                | 180 +++++++++---
 pkg/tcpip/link/tun/BUILD                         |  18 +-
 pkg/tcpip/link/tun/device.go                     | 352 +++++++++++++++++++++++
 pkg/tcpip/link/tun/protocol.go                   |  56 ++++
 pkg/tcpip/stack/nic.go                           |  32 +++
 pkg/tcpip/stack/stack.go                         |  39 +++
 test/syscalls/BUILD                              |   2 +
 test/syscalls/linux/BUILD                        |  30 ++
 test/syscalls/linux/dev.cc                       |   7 +
 test/syscalls/linux/socket_netlink_route_util.cc | 163 +++++++++++
 test/syscalls/linux/socket_netlink_route_util.h  |  55 ++++
 test/syscalls/linux/tuntap.cc                    | 346 ++++++++++++++++++++++
 21 files changed, 1490 insertions(+), 39 deletions(-)
 create mode 100644 pkg/abi/linux/ioctl_tun.go
 create mode 100644 pkg/sentry/fs/dev/net_tun.go
 create mode 100644 pkg/tcpip/link/tun/device.go
 create mode 100644 pkg/tcpip/link/tun/protocol.go
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.cc
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.h
 create mode 100644 test/syscalls/linux/tuntap.cc

(limited to 'pkg')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index a89f34d4b..322d1ccc4 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -30,6 +30,7 @@ go_library(
         "futex.go",
         "inotify.go",
         "ioctl.go",
+        "ioctl_tun.go",
         "ip.go",
         "ipc.go",
         "limits.go",
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 0e18db9ef..2062e6a4b 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -72,3 +72,29 @@ const (
 	SIOCGMIIPHY   = 0x8947
 	SIOCGMIIREG   = 0x8948
 )
+
+// ioctl(2) directions. Used to calculate requests number.
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NONE  = 0
+	_IOC_WRITE = 1
+	_IOC_READ  = 2
+)
+
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NRBITS   = 8
+	_IOC_TYPEBITS = 8
+	_IOC_SIZEBITS = 14
+	_IOC_DIRBITS  = 2
+
+	_IOC_NRSHIFT   = 0
+	_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS
+	_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS
+	_IOC_DIRSHIFT  = _IOC_SIZESHIFT + _IOC_SIZEBITS
+)
+
+// IOC outputs the result of _IOC macro in asm-generic/ioctl.h.
+func IOC(dir, typ, nr, size uint32) uint32 {
+	return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT
+}
diff --git a/pkg/abi/linux/ioctl_tun.go b/pkg/abi/linux/ioctl_tun.go
new file mode 100644
index 000000000..c59c9c136
--- /dev/null
+++ b/pkg/abi/linux/ioctl_tun.go
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// ioctl(2) request numbers from linux/if_tun.h
+var (
+	TUNSETIFF = IOC(_IOC_WRITE, 'T', 202, 4)
+	TUNGETIFF = IOC(_IOC_READ, 'T', 210, 4)
+)
+
+// Flags from net/if_tun.h
+const (
+	IFF_TUN      = 0x0001
+	IFF_TAP      = 0x0002
+	IFF_NO_PI    = 0x1000
+	IFF_NOFILTER = 0x1000
+)
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 4c4b7d5cc..9b6bb26d0 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -9,6 +9,7 @@ go_library(
         "device.go",
         "fs.go",
         "full.go",
+        "net_tun.go",
         "null.go",
         "random.go",
         "tty.go",
@@ -19,15 +20,19 @@ go_library(
         "//pkg/context",
         "//pkg/rand",
         "//pkg/safemem",
+        "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
+        "//pkg/sentry/socket/netstack",
         "//pkg/syserror",
+        "//pkg/tcpip/link/tun",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 35bd23991..7e66c29b0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -66,8 +66,8 @@ func newMemDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo
 	})
 }
 
-func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555))
+func newDirectory(ctx context.Context, contents map[string]*fs.Inode, msrc *fs.MountSource) *fs.Inode {
+	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return fs.NewInode(ctx, iops, msrc, fs.StableAttr{
 		DeviceID:  devDevice.DeviceID(),
 		InodeID:   devDevice.NextIno(),
@@ -111,7 +111,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		// A devpts is typically mounted at /dev/pts to provide
 		// pseudoterminal support. Place an empty directory there for
 		// the devpts to be mounted over.
-		"pts": newDirectory(ctx, msrc),
+		"pts": newDirectory(ctx, nil, msrc),
 		// Similarly, applications expect a ptmx device at /dev/ptmx
 		// connected to the terminals provided by /dev/pts/. Rather
 		// than creating a device directly (which requires a hairy
@@ -124,6 +124,10 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+
+		"net": newDirectory(ctx, map[string]*fs.Inode{
+			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
+		}, msrc),
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
new file mode 100644
index 000000000..755644488
--- /dev/null
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	netTunDevMajor = 10
+	netTunDevMinor = 200
+)
+
+// +stateify savable
+type netTunInodeOperations struct {
+	fsutil.InodeGenericChecker       `state:"nosave"`
+	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+	fsutil.InodeNoopAllocate         `state:"nosave"`
+	fsutil.InodeNoopRelease          `state:"nosave"`
+	fsutil.InodeNoopTruncate         `state:"nosave"`
+	fsutil.InodeNoopWriteOut         `state:"nosave"`
+	fsutil.InodeNotDirectory         `state:"nosave"`
+	fsutil.InodeNotMappable          `state:"nosave"`
+	fsutil.InodeNotSocket            `state:"nosave"`
+	fsutil.InodeNotSymlink           `state:"nosave"`
+	fsutil.InodeVirtual              `state:"nosave"`
+
+	fsutil.InodeSimpleAttributes
+}
+
+var _ fs.InodeOperations = (*netTunInodeOperations)(nil)
+
+func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *netTunInodeOperations {
+	return &netTunInodeOperations{
+		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC),
+	}
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
+}
+
+// +stateify savable
+type netTunFileOperations struct {
+	fsutil.FileNoSeek               `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	device tun.Device
+}
+
+var _ fs.FileOperations = (*netTunFileOperations)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (fops *netTunFileOperations) Release() {
+	fops.device.Release()
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	request := args[1].Uint()
+	data := args[2].Pointer()
+
+	switch request {
+	case linux.TUNSETIFF:
+		t := kernel.TaskFromContext(ctx)
+		if t == nil {
+			panic("Ioctl should be called from a task context")
+		}
+		if !t.HasCapability(linux.CAP_NET_ADMIN) {
+			return 0, syserror.EPERM
+		}
+		stack, ok := t.NetworkContext().(*netstack.Stack)
+		if !ok {
+			return 0, syserror.EINVAL
+		}
+
+		var req linux.IFReq
+		if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		flags := usermem.ByteOrder.Uint16(req.Data[:])
+		return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+
+	case linux.TUNGETIFF:
+		var req linux.IFReq
+
+		copy(req.IFName[:], fops.device.Name())
+
+		// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+		// there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c.
+		flags := fops.device.Flags() | linux.IFF_NOFILTER
+		usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+		_, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// Write implements fs.FileOperations.Write.
+func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	data := make([]byte, src.NumBytes())
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+	return fops.device.Write(data)
+}
+
+// Read implements fs.FileOperations.Read.
+func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	data, err := fops.device.Read()
+	if err != nil {
+		return 0, err
+	}
+	n, err := dst.CopyOut(ctx, data)
+	if n > 0 && n < len(data) {
+		// Not an error for partial copying. Packet truncated.
+		err = nil
+	}
+	return int64(n), err
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fops.device.Readiness(mask)
+}
+
+// EventRegister implements watier.Waitable.EventRegister.
+func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fops.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements watier.Waitable.EventUnregister.
+func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+	fops.device.EventUnregister(e)
+}
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 2269f6237..4b5a0fca6 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -29,6 +29,7 @@ var (
 	EACCES       = error(syscall.EACCES)
 	EAGAIN       = error(syscall.EAGAIN)
 	EBADF        = error(syscall.EBADF)
+	EBADFD       = error(syscall.EBADFD)
 	EBUSY        = error(syscall.EBUSY)
 	ECHILD       = error(syscall.ECHILD)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 150310c11..17e94c562 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -156,3 +156,9 @@ func (vv *VectorisedView) Append(vv2 VectorisedView) {
 	vv.views = append(vv.views, vv2.views...)
 	vv.size += vv2.size
 }
+
+// AppendView appends the given view into this vectorised view.
+func (vv *VectorisedView) AppendView(v View) {
+	vv.views = append(vv.views, v)
+	vv.size += len(v)
+}
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 3974c464e..b8b93e78e 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = ["channel.go"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 78d447acd..5944ba190 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -20,6 +20,7 @@ package channel
 import (
 	"context"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -33,6 +34,118 @@ type PacketInfo struct {
 	Route stack.Route
 }
 
+// Notification is the interface for receiving notification from the packet
+// queue.
+type Notification interface {
+	// WriteNotify will be called when a write happens to the queue.
+	WriteNotify()
+}
+
+// NotificationHandle is an opaque handle to the registered notification target.
+// It can be used to unregister the notification when no longer interested.
+//
+// +stateify savable
+type NotificationHandle struct {
+	n Notification
+}
+
+type queue struct {
+	// mu protects fields below.
+	mu sync.RWMutex
+	// c is the outbound packet channel. Sending to c should hold mu.
+	c        chan PacketInfo
+	numWrite int
+	numRead  int
+	notify   []*NotificationHandle
+}
+
+func (q *queue) Close() {
+	close(q.c)
+}
+
+func (q *queue) Read() (PacketInfo, bool) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	select {
+	case p := <-q.c:
+		q.numRead++
+		return p, true
+	default:
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	// We have to receive from channel without holding the lock, since it can
+	// block indefinitely. This will cause a window that numWrite - numRead
+	// produces a larger number, but won't go to negative. numWrite >= numRead
+	// still holds.
+	select {
+	case pkt := <-q.c:
+		q.mu.Lock()
+		defer q.mu.Unlock()
+		q.numRead++
+		return pkt, true
+	case <-ctx.Done():
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) Write(p PacketInfo) bool {
+	wrote := false
+
+	// It's important to make sure nobody can see numWrite until we increment it,
+	// so numWrite >= numRead holds.
+	q.mu.Lock()
+	select {
+	case q.c <- p:
+		wrote = true
+		q.numWrite++
+	default:
+	}
+	notify := q.notify
+	q.mu.Unlock()
+
+	if wrote {
+		// Send notification outside of lock.
+		for _, h := range notify {
+			h.n.WriteNotify()
+		}
+	}
+	return wrote
+}
+
+func (q *queue) Num() int {
+	q.mu.RLock()
+	defer q.mu.RUnlock()
+	n := q.numWrite - q.numRead
+	if n < 0 {
+		panic("numWrite < numRead")
+	}
+	return n
+}
+
+func (q *queue) AddNotify(notify Notification) *NotificationHandle {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	h := &NotificationHandle{n: notify}
+	q.notify = append(q.notify, h)
+	return h
+}
+
+func (q *queue) RemoveNotify(handle *NotificationHandle) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Make a copy, since we reads the array outside of lock when notifying.
+	notify := make([]*NotificationHandle, 0, len(q.notify))
+	for _, h := range q.notify {
+		if h != handle {
+			notify = append(notify, h)
+		}
+	}
+	q.notify = notify
+}
+
 // Endpoint is link layer endpoint that stores outbound packets in a channel
 // and allows injection of inbound packets.
 type Endpoint struct {
@@ -41,14 +154,16 @@ type Endpoint struct {
 	linkAddr           tcpip.LinkAddress
 	LinkEPCapabilities stack.LinkEndpointCapabilities
 
-	// c is where outbound packets are queued.
-	c chan PacketInfo
+	// Outbound packet queue.
+	q *queue
 }
 
 // New creates a new channel endpoint.
 func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 	return &Endpoint{
-		c:        make(chan PacketInfo, size),
+		q: &queue{
+			c: make(chan PacketInfo, size),
+		},
 		mtu:      mtu,
 		linkAddr: linkAddr,
 	}
@@ -57,43 +172,36 @@ func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 // Close closes e. Further packet injections will panic. Reads continue to
 // succeed until all packets are read.
 func (e *Endpoint) Close() {
-	close(e.c)
+	e.q.Close()
 }
 
-// Read does non-blocking read for one packet from the outbound packet queue.
+// Read does non-blocking read one packet from the outbound packet queue.
 func (e *Endpoint) Read() (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	default:
-		return PacketInfo{}, false
-	}
+	return e.q.Read()
 }
 
 // ReadContext does blocking read for one packet from the outbound packet queue.
 // It can be cancelled by ctx, and in this case, it returns false.
 func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	case <-ctx.Done():
-		return PacketInfo{}, false
-	}
+	return e.q.ReadContext(ctx)
 }
 
 // Drain removes all outbound packets from the channel and counts them.
 func (e *Endpoint) Drain() int {
 	c := 0
 	for {
-		select {
-		case <-e.c:
-			c++
-		default:
+		if _, ok := e.Read(); !ok {
 			return c
 		}
+		c++
 	}
 }
 
+// NumQueued returns the number of packet queued for outbound.
+func (e *Endpoint) NumQueued() int {
+	return e.q.Num()
+}
+
 // InjectInbound injects an inbound packet.
 func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
@@ -155,10 +263,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		Route: route,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
@@ -171,7 +276,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 	route.Release()
 	payloadView := pkts[0].Data.ToView()
 	n := 0
-packetLoop:
 	for _, pkt := range pkts {
 		off := pkt.DataOffset
 		size := pkt.DataSize
@@ -185,12 +289,10 @@ packetLoop:
 			Route: route,
 		}
 
-		select {
-		case e.c <- p:
-			n++
-		default:
-			break packetLoop
+		if !e.q.Write(p) {
+			break
 		}
+		n++
 	}
 
 	return n, nil
@@ -204,13 +306,21 @@ func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 		GSO:   nil,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
 
 // Wait implements stack.LinkEndpoint.Wait.
 func (*Endpoint) Wait() {}
+
+// AddNotify adds a notification target for receiving event about outgoing
+// packets.
+func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
+	return e.q.AddNotify(notify)
+}
+
+// RemoveNotify removes handle from the list of notification targets.
+func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
+	e.q.RemoveNotify(handle)
+}
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index e5096ea38..e0db6cf54 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -4,6 +4,22 @@ package(licenses = ["notice"])
 
 go_library(
     name = "tun",
-    srcs = ["tun_unsafe.go"],
+    srcs = [
+        "device.go",
+        "protocol.go",
+        "tun_unsafe.go",
+    ],
     visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/refs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
 )
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
new file mode 100644
index 000000000..6ff47a742
--- /dev/null
+++ b/pkg/tcpip/link/tun/device.go
@@ -0,0 +1,352 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// drivers/net/tun.c:tun_net_init()
+	defaultDevMtu = 1500
+
+	// Queue length for outbound packet, arriving at fd side for read. Overflow
+	// causes packet drops. gVisor implementation-specific.
+	defaultDevOutQueueLen = 1024
+)
+
+var zeroMAC [6]byte
+
+// Device is an opened /dev/net/tun device.
+//
+// +stateify savable
+type Device struct {
+	waiter.Queue
+
+	mu           sync.RWMutex `state:"nosave"`
+	endpoint     *tunEndpoint
+	notifyHandle *channel.NotificationHandle
+	flags        uint16
+}
+
+// beforeSave is invoked by stateify.
+func (d *Device) beforeSave() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	// TODO(b/110961832): Restore the device to stack. At this moment, the stack
+	// is not savable.
+	if d.endpoint != nil {
+		panic("/dev/net/tun does not support save/restore when a device is associated with it.")
+	}
+}
+
+// Release implements fs.FileOperations.Release.
+func (d *Device) Release() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	// Decrease refcount if there is an endpoint associated with this file.
+	if d.endpoint != nil {
+		d.endpoint.RemoveNotify(d.notifyHandle)
+		d.endpoint.DecRef()
+		d.endpoint = nil
+	}
+}
+
+// SetIff services TUNSETIFF ioctl(2) request.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if d.endpoint != nil {
+		return syserror.EINVAL
+	}
+
+	// Input validations.
+	isTun := flags&linux.IFF_TUN != 0
+	isTap := flags&linux.IFF_TAP != 0
+	supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
+	if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
+		return syserror.EINVAL
+	}
+
+	prefix := "tun"
+	if isTap {
+		prefix = "tap"
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix)
+	if err != nil {
+		return syserror.EINVAL
+	}
+
+	d.endpoint = endpoint
+	d.notifyHandle = d.endpoint.AddNotify(d)
+	d.flags = flags
+	return nil
+}
+
+func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error) {
+	for {
+		// 1. Try to attach to an existing NIC.
+		if name != "" {
+			if nic, found := s.GetNICByName(name); found {
+				endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+				if !ok {
+					// Not a NIC created by tun device.
+					return nil, syserror.EOPNOTSUPP
+				}
+				if !endpoint.TryIncRef() {
+					// Race detected: NIC got deleted in between.
+					continue
+				}
+				return endpoint, nil
+			}
+		}
+
+		// 2. Creating a new NIC.
+		id := tcpip.NICID(s.UniqueID())
+		endpoint := &tunEndpoint{
+			Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
+			stack:    s,
+			nicID:    id,
+			name:     name,
+		}
+		if endpoint.name == "" {
+			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
+		}
+		err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{
+			Name: endpoint.name,
+		})
+		switch err {
+		case nil:
+			return endpoint, nil
+		case tcpip.ErrDuplicateNICID:
+			// Race detected: A NIC has been created in between.
+			continue
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+}
+
+// Write inject one inbound packet to the network interface.
+func (d *Device) Write(data []byte) (int64, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return 0, syserror.EBADFD
+	}
+	if !endpoint.IsAttached() {
+		return 0, syserror.EIO
+	}
+
+	dataLen := int64(len(data))
+
+	// Packet information.
+	var pktInfoHdr PacketInfoHeader
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		if len(data) < PacketInfoHeaderSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize])
+		data = data[PacketInfoHeaderSize:]
+	}
+
+	// Ethernet header (TAP only).
+	var ethHdr header.Ethernet
+	if d.hasFlags(linux.IFF_TAP) {
+		if len(data) < header.EthernetMinimumSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		ethHdr = header.Ethernet(data[:header.EthernetMinimumSize])
+		data = data[header.EthernetMinimumSize:]
+	}
+
+	// Try to determine network protocol number, default zero.
+	var protocol tcpip.NetworkProtocolNumber
+	switch {
+	case pktInfoHdr != nil:
+		protocol = pktInfoHdr.Protocol()
+	case ethHdr != nil:
+		protocol = ethHdr.Type()
+	}
+
+	// Try to determine remote link address, default zero.
+	var remote tcpip.LinkAddress
+	switch {
+	case ethHdr != nil:
+		remote = ethHdr.SourceAddress()
+	default:
+		remote = tcpip.LinkAddress(zeroMAC[:])
+	}
+
+	pkt := tcpip.PacketBuffer{
+		Data: buffer.View(data).ToVectorisedView(),
+	}
+	if ethHdr != nil {
+		pkt.LinkHeader = buffer.View(ethHdr)
+	}
+	endpoint.InjectLinkAddr(protocol, remote, pkt)
+	return dataLen, nil
+}
+
+// Read reads one outgoing packet from the network interface.
+func (d *Device) Read() ([]byte, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return nil, syserror.EBADFD
+	}
+
+	for {
+		info, ok := endpoint.Read()
+		if !ok {
+			return nil, syserror.ErrWouldBlock
+		}
+
+		v, ok := d.encodePkt(&info)
+		if !ok {
+			// Ignore unsupported packet.
+			continue
+		}
+		return v, nil
+	}
+}
+
+// encodePkt encodes packet for fd side.
+func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
+	var vv buffer.VectorisedView
+
+	// Packet information.
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		hdr := make(PacketInfoHeader, PacketInfoHeaderSize)
+		hdr.Encode(&PacketInfoFields{
+			Protocol: info.Proto,
+		})
+		vv.AppendView(buffer.View(hdr))
+	}
+
+	// If the packet does not already have link layer header, and the route
+	// does not exist, we can't compute it. This is possibly a raw packet, tun
+	// device doesn't support this at the moment.
+	if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" {
+		return nil, false
+	}
+
+	// Ethernet header (TAP only).
+	if d.hasFlags(linux.IFF_TAP) {
+		// Add ethernet header if not provided.
+		if info.Pkt.LinkHeader == nil {
+			hdr := &header.EthernetFields{
+				SrcAddr: info.Route.LocalLinkAddress,
+				DstAddr: info.Route.RemoteLinkAddress,
+				Type:    info.Proto,
+			}
+			if hdr.SrcAddr == "" {
+				hdr.SrcAddr = d.endpoint.LinkAddress()
+			}
+
+			eth := make(header.Ethernet, header.EthernetMinimumSize)
+			eth.Encode(hdr)
+			vv.AppendView(buffer.View(eth))
+		} else {
+			vv.AppendView(info.Pkt.LinkHeader)
+		}
+	}
+
+	// Append upper headers.
+	vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):]))
+	// Append data payload.
+	vv.Append(info.Pkt.Data)
+
+	return vv.ToView(), true
+}
+
+// Name returns the name of the attached network interface. Empty string if
+// unattached.
+func (d *Device) Name() string {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	if d.endpoint != nil {
+		return d.endpoint.name
+	}
+	return ""
+}
+
+// Flags returns the flags set for d. Zero value if unset.
+func (d *Device) Flags() uint16 {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	return d.flags
+}
+
+func (d *Device) hasFlags(flags uint16) bool {
+	return d.flags&flags == flags
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn != 0 {
+		d.mu.RLock()
+		endpoint := d.endpoint
+		d.mu.RUnlock()
+		if endpoint != nil && endpoint.NumQueued() == 0 {
+			mask &= ^waiter.EventIn
+		}
+	}
+	return mask & (waiter.EventIn | waiter.EventOut)
+}
+
+// WriteNotify implements channel.Notification.WriteNotify.
+func (d *Device) WriteNotify() {
+	d.Notify(waiter.EventIn)
+}
+
+// tunEndpoint is the link endpoint for the NIC created by the tun device.
+//
+// It is ref-counted as multiple opening files can attach to the same NIC.
+// The last owner is responsible for deleting the NIC.
+type tunEndpoint struct {
+	*channel.Endpoint
+
+	refs.AtomicRefCount
+
+	stack *stack.Stack
+	nicID tcpip.NICID
+	name  string
+}
+
+// DecRef decrements refcount of e, removes NIC if refcount goes to 0.
+func (e *tunEndpoint) DecRef() {
+	e.DecRefWithDestructor(func() {
+		e.stack.RemoveNIC(e.nicID)
+	})
+}
diff --git a/pkg/tcpip/link/tun/protocol.go b/pkg/tcpip/link/tun/protocol.go
new file mode 100644
index 000000000..89d9d91a9
--- /dev/null
+++ b/pkg/tcpip/link/tun/protocol.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// PacketInfoHeaderSize is the size of the packet information header.
+	PacketInfoHeaderSize = 4
+
+	offsetFlags    = 0
+	offsetProtocol = 2
+)
+
+// PacketInfoFields contains fields sent through the wire if IFF_NO_PI flag is
+// not set.
+type PacketInfoFields struct {
+	Flags    uint16
+	Protocol tcpip.NetworkProtocolNumber
+}
+
+// PacketInfoHeader is the wire representation of the packet information sent if
+// IFF_NO_PI flag is not set.
+type PacketInfoHeader []byte
+
+// Encode encodes f into h.
+func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
+	binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags)
+	binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol))
+}
+
+// Flags returns the flag field in h.
+func (h PacketInfoHeader) Flags() uint16 {
+	return binary.BigEndian.Uint16(h[offsetFlags:])
+}
+
+// Protocol returns the protocol field in h.
+func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 862954ab2..46d3a6646 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -298,6 +298,33 @@ func (n *NIC) enable() *tcpip.Error {
 	return nil
 }
 
+// remove detaches NIC from the link endpoint, and marks existing referenced
+// network endpoints expired. This guarantees no packets between this NIC and
+// the network stack.
+func (n *NIC) remove() *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	// Detach from link endpoint, so no packet comes in.
+	n.linkEP.Attach(nil)
+
+	// Remove permanent and permanentTentative addresses, so no packet goes out.
+	var errs []*tcpip.Error
+	for nid, ref := range n.mu.endpoints {
+		switch ref.getKind() {
+		case permanentTentative, permanent:
+			if err := n.removePermanentAddressLocked(nid.LocalAddress); err != nil {
+				errs = append(errs, err)
+			}
+		}
+	}
+	if len(errs) > 0 {
+		return errs[0]
+	}
+
+	return nil
+}
+
 // becomeIPv6Router transitions n into an IPv6 router.
 //
 // When transitioning into an IPv6 router, host-only state (NDP discovered
@@ -1302,6 +1329,11 @@ func (n *NIC) Stack() *Stack {
 	return n.stack
 }
 
+// LinkEndpoint returns the link endpoint of n.
+func (n *NIC) LinkEndpoint() LinkEndpoint {
+	return n.linkEP
+}
+
 // isAddrTentative returns true if addr is tentative on n.
 //
 // Note that if addr is not associated with n, then this function will return
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f0ed76fbe..900dd46c5 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -916,6 +916,18 @@ func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
+// GetNICByName gets the NIC specified by name.
+func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	for _, nic := range s.nics {
+		if nic.Name() == name {
+			return nic, true
+		}
+	}
+	return nil, false
+}
+
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
 // delivering packets to it.
 func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
@@ -956,6 +968,33 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return nic.enabled()
 }
 
+// RemoveNIC removes NIC and all related routes from the network stack.
+func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+	delete(s.nics, id)
+
+	// Remove routes in-place. n tracks the number of routes written.
+	n := 0
+	for i, r := range s.routeTable {
+		if r.NIC != id {
+			// Keep this route.
+			if i > n {
+				s.routeTable[n] = r
+			}
+			n++
+		}
+	}
+	s.routeTable = s.routeTable[:n]
+
+	return nic.remove()
+}
+
 // NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d1977d4de..3518e862d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -678,6 +678,8 @@ syscall_test(
     test = "//test/syscalls/linux:truncate_test",
 )
 
+syscall_test(test = "//test/syscalls/linux:tuntap_test")
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index aa303af84..704bae17b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -131,6 +131,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "socket_netlink_route_util",
+    testonly = 1,
+    srcs = ["socket_netlink_route_util.cc"],
+    hdrs = ["socket_netlink_route_util.h"],
+    deps = [
+        ":socket_netlink_util",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 cc_library(
     name = "socket_test_util",
     testonly = 1,
@@ -3430,6 +3441,25 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_test",
+    testonly = 1,
+    srcs = ["tuntap.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/syscalls/linux:socket_netlink_route_util",
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4dd302eed..4e473268c 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,6 +153,13 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
+TEST(DevTest, NetTunExists) {
+  struct stat statbuf = {};
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
new file mode 100644
index 000000000..53eb3b6b2
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+
+#include <linux/if.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr uint32_t kSeq = 12345;
+
+}  // namespace
+
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
+PosixErrorOr<std::vector<Link>> DumpLinks() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  std::vector<Link> links;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+    if (rta == nullptr) {
+      // Ignore links that do not have a name.
+      return;
+    }
+
+    links.emplace_back();
+    links.back().index = msg->ifi_index;
+    links.back().type = msg->ifi_type;
+    links.back().name =
+        std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+  }));
+  return links;
+}
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.type == ARPHRD_LOOPBACK) {
+      return absl::optional<Link>(link);
+    }
+  }
+  return absl::optional<Link>();
+}
+
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifaddr;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifaddr.ifa_index = index;
+  req.ifaddr.ifa_family = family;
+  req.ifaddr.ifa_prefixlen = prefixlen;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFA_LOCAL;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char pad[NLMSG_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+  req.ifinfo.ifi_flags = flags;
+  req.ifinfo.ifi_change = change;
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFLA_ADDRESS;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
new file mode 100644
index 000000000..2c018e487
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+struct Link {
+  int index;
+  int16_t type;
+  std::string name;
+};
+
+PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
+                     const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
+PosixErrorOr<std::vector<Link>> DumpLinks();
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink();
+
+// LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen);
+
+// LinkChangeFlags changes interface flags. E.g. IFF_UP.
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change);
+
+// LinkSetMacAddr sets IFLA_ADDRESS attribute of the interface.
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen);
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
new file mode 100644
index 000000000..f6ac9d7b8
--- /dev/null
+++ b/test/syscalls/linux/tuntap.cc
@@ -0,0 +1,346 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_tun.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr int kIPLen = 4;
+
+constexpr const char kDevNetTun[] = "/dev/net/tun";
+constexpr const char kTapName[] = "tap0";
+
+constexpr const uint8_t kMacA[ETH_ALEN] = {0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA};
+constexpr const uint8_t kMacB[ETH_ALEN] = {0xBB, 0xBB, 0xBB, 0xBB, 0xBB, 0xBB};
+
+PosixErrorOr<std::set<std::string>> DumpLinkNames() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  std::set<std::string> names;
+  for (const auto& link : links) {
+    names.emplace(link.name);
+  }
+  return names;
+}
+
+PosixErrorOr<absl::optional<Link>> GetLinkByName(const std::string& name) {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.name == name) {
+      return absl::optional<Link>(link);
+    }
+  }
+  return absl::optional<Link>();
+}
+
+struct pihdr {
+  uint16_t pi_flags;
+  uint16_t pi_protocol;
+} __attribute__((packed));
+
+struct ping_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct iphdr ip;
+  struct icmphdr icmp;
+  char payload[64];
+} __attribute__((packed));
+
+ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                          const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  ping_pkt pkt = {};
+
+  pkt.pi.pi_protocol = htons(ETH_P_IP);
+
+  memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
+  memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
+  pkt.eth.h_proto = htons(ETH_P_IP);
+
+  pkt.ip.ihl = 5;
+  pkt.ip.version = 4;
+  pkt.ip.tos = 0;
+  pkt.ip.tot_len = htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+                         sizeof(pkt.payload));
+  pkt.ip.id = 1;
+  pkt.ip.frag_off = 1 << 6;  // Do not fragment
+  pkt.ip.ttl = 64;
+  pkt.ip.protocol = IPPROTO_ICMP;
+  inet_pton(AF_INET, dstip, &pkt.ip.daddr);
+  inet_pton(AF_INET, srcip, &pkt.ip.saddr);
+  pkt.ip.check = IPChecksum(pkt.ip);
+
+  pkt.icmp.type = ICMP_ECHO;
+  pkt.icmp.code = 0;
+  pkt.icmp.checksum = 0;
+  pkt.icmp.un.echo.sequence = 1;
+  pkt.icmp.un.echo.id = 1;
+
+  strncpy(pkt.payload, "abcd", sizeof(pkt.payload));
+  pkt.icmp.checksum = ICMPChecksum(pkt.icmp, pkt.payload, sizeof(pkt.payload));
+
+  return pkt;
+}
+
+struct arp_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct arphdr arp;
+  uint8_t arp_sha[ETH_ALEN];
+  uint8_t arp_spa[kIPLen];
+  uint8_t arp_tha[ETH_ALEN];
+  uint8_t arp_tpa[kIPLen];
+} __attribute__((packed));
+
+std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                            const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  std::string buffer;
+  buffer.resize(sizeof(arp_pkt));
+
+  arp_pkt* pkt = reinterpret_cast<arp_pkt*>(&buffer[0]);
+  {
+    pkt->pi.pi_protocol = htons(ETH_P_ARP);
+
+    memcpy(pkt->eth.h_dest, kMacA, sizeof(pkt->eth.h_dest));
+    memcpy(pkt->eth.h_source, kMacB, sizeof(pkt->eth.h_source));
+    pkt->eth.h_proto = htons(ETH_P_ARP);
+
+    pkt->arp.ar_hrd = htons(ARPHRD_ETHER);
+    pkt->arp.ar_pro = htons(ETH_P_IP);
+    pkt->arp.ar_hln = ETH_ALEN;
+    pkt->arp.ar_pln = kIPLen;
+    pkt->arp.ar_op = htons(ARPOP_REPLY);
+
+    memcpy(pkt->arp_sha, srcmac, sizeof(pkt->arp_sha));
+    inet_pton(AF_INET, srcip, pkt->arp_spa);
+    memcpy(pkt->arp_tha, dstmac, sizeof(pkt->arp_tha));
+    inet_pton(AF_INET, dstip, pkt->arp_tpa);
+  }
+  return buffer;
+}
+
+}  // namespace
+
+class TuntapTest : public ::testing::Test {
+ protected:
+  void TearDown() override {
+    if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))) {
+      // Bring back capability if we had dropped it in test case.
+      ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, true));
+    }
+  }
+};
+
+TEST_F(TuntapTest, CreateInterfaceNoCap) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, false));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  strncpy(ifr.ifr_name, kTapName, IFNAMSIZ);
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_F(TuntapTest, CreateFixedNameInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_expect = ifr_set;
+  // See __tun_chr_ioctl() in net/drivers/tun.c.
+  ifr_expect.ifr_flags |= IFF_NOFILTER;
+
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(kTapName)));
+  EXPECT_THAT(memcmp(&ifr_expect, &ifr_get, sizeof(ifr_get)), ::testing::Eq(0));
+}
+
+TEST_F(TuntapTest, CreateInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  // Empty ifr.ifr_name. Let kernel assign.
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  std::string ifname = ifr_get.ifr_name;
+  EXPECT_THAT(ifname, ::testing::StartsWith("tap"));
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(ifname)));
+}
+
+TEST_F(TuntapTest, InvalidReadWrite) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  char buf[128] = {};
+  EXPECT_THAT(read(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+}
+
+TEST_F(TuntapTest, WriteToDownDevice) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // FIXME: gVisor always creates enabled/up'd interfaces.
+  SKIP_IF(IsRunningOnGvisor());
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  // Device created should be down by default.
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  char buf[128] = {};
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+//   * Assign IP address 10.0.0.1/24 to kernel.
+//   * MAC address: kMacA
+//   * Bring up the interface.
+// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+//   * If packet is an ICMP echo reply, it stops and passes the test.
+//   * If packet is an ARP request, it responds with canned reply and resends
+//   the
+//     ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Interface creation.
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  absl::optional<Link> link =
+      ASSERT_NO_ERRNO_AND_VALUE(GetLinkByName(kTapName));
+  ASSERT_TRUE(link.has_value());
+
+  // Interface setup.
+  struct in_addr addr;
+  inet_pton(AF_INET, "10.0.0.1", &addr);
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
+                                   &addr, sizeof(addr)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
+    EXPECT_NO_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+
+    // FIXME: gVisor always creates enabled/up'd interfaces.
+    EXPECT_NO_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+  }
+
+  ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+  std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+
+  // Send ping, this would trigger an ARP request on Linux.
+  EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+              SyscallSucceedsWithValue(sizeof(ping_req)));
+
+  // Receive loop to process inbound packets.
+  struct inpkt {
+    union {
+      pihdr pi;
+      ping_pkt ping;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    int n = read(fd.get(), &r, sizeof(r));
+    EXPECT_THAT(n, SyscallSucceeds());
+
+    if (n < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << n << std::endl;
+      continue;
+    }
+
+    // Process ARP packet.
+    if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      // Respond with canned ARP reply.
+      EXPECT_THAT(write(fd.get(), arp_rep.data(), arp_rep.size()),
+                  SyscallSucceedsWithValue(arp_rep.size()));
+      // First ping request might have been dropped due to mac address not in
+      // ARP cache. Send it again.
+      EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+                  SyscallSucceedsWithValue(sizeof(ping_req)));
+    }
+
+    // Process ping response packet.
+    if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol &&
+        r.ping.ip.protocol == ping_req.ip.protocol &&
+        !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
+        !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) &&
+        r.ping.icmp.type == 0 && r.ping.icmp.code == 0) {
+      // Ends and passes the test.
+      break;
+    }
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From c37b196455e8b3816298e3eea98e4ee2dab8d368 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 24 Feb 2020 10:31:01 -0800
Subject: Add support for tearing down protocol dispatchers and TIME_WAIT
 endpoints.

Protocol dispatchers were previously leaked. Bypassing TIME_WAIT is required to
test this change.

Also fix a race when a socket in SYN-RCVD is closed. This is also required to
test this change.

PiperOrigin-RevId: 296922548
---
 pkg/tcpip/adapters/gonet/gonet_test.go | 63 ++++++++++++++++++++++++++--------
 pkg/tcpip/network/arp/arp.go           | 20 +++++++----
 pkg/tcpip/network/ipv4/ipv4.go         |  6 ++++
 pkg/tcpip/network/ipv6/ipv6.go         |  6 ++++
 pkg/tcpip/stack/registration.go        | 23 ++++++++++---
 pkg/tcpip/stack/stack.go               | 14 +++++++-
 pkg/tcpip/stack/stack_test.go          |  6 ++++
 pkg/tcpip/stack/transport_demuxer.go   | 20 -----------
 pkg/tcpip/stack/transport_test.go      | 15 +++++++-
 pkg/tcpip/tcpip.go                     |  8 ++++-
 pkg/tcpip/transport/icmp/endpoint.go   |  5 +++
 pkg/tcpip/transport/icmp/protocol.go   | 16 ++++++---
 pkg/tcpip/transport/packet/endpoint.go |  5 +++
 pkg/tcpip/transport/raw/endpoint.go    |  5 +++
 pkg/tcpip/transport/tcp/accept.go      |  9 ++++-
 pkg/tcpip/transport/tcp/connect.go     |  4 +--
 pkg/tcpip/transport/tcp/dispatcher.go  | 31 ++++++++++++++++-
 pkg/tcpip/transport/tcp/endpoint.go    | 33 ++++++++++++++++--
 pkg/tcpip/transport/tcp/protocol.go    | 14 ++++++--
 pkg/tcpip/transport/udp/endpoint.go    |  5 +++
 pkg/tcpip/transport/udp/protocol.go    | 14 +++++---
 21 files changed, 256 insertions(+), 66 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index ea0a0409a..3c552988a 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -127,6 +127,10 @@ func TestCloseReader(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 
@@ -175,6 +179,10 @@ func TestCloseReaderWithForwarder(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -225,30 +233,21 @@ func TestCloseRead(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
 
 	fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) {
 		var wq waiter.Queue
-		ep, err := r.CreateEndpoint(&wq)
+		_, err := r.CreateEndpoint(&wq)
 		if err != nil {
 			t.Fatalf("r.CreateEndpoint() = %v", err)
 		}
-		defer ep.Close()
-		r.Complete(false)
-
-		c := NewTCPConn(&wq, ep)
-
-		buf := make([]byte, 256)
-		n, e := c.Read(buf)
-		if e != nil || string(buf[:n]) != "abc123" {
-			t.Fatalf("c.Read() = (%d, %v), want (6, nil)", n, e)
-		}
-
-		if n, e = c.Write([]byte("abc123")); e != nil {
-			t.Errorf("c.Write() = (%d, %v), want (6, nil)", n, e)
-		}
+		// Endpoint will be closed in deferred s.Close (above).
 	})
 
 	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
@@ -278,6 +277,10 @@ func TestCloseWrite(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -334,6 +337,10 @@ func TestUDPForwarder(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
@@ -391,6 +398,10 @@ func TestDeadlineChange(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 
@@ -440,6 +451,10 @@ func TestPacketConnTransfer(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
@@ -492,6 +507,10 @@ func TestConnectedPacketConnTransfer(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr := tcpip.FullAddress{NICID, ip, 11211}
@@ -562,6 +581,8 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) {
 	stop = func() {
 		c1.Close()
 		c2.Close()
+		s.Close()
+		s.Wait()
 	}
 
 	if err := l.Close(); err != nil {
@@ -624,6 +645,10 @@ func TestTCPDialError(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr := tcpip.FullAddress{NICID, ip, 11211}
@@ -641,6 +666,10 @@ func TestDialContextTCPCanceled(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -659,6 +688,10 @@ func TestDialContextTCPTimeout(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 4da13c5df..e9fcc89a8 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -148,12 +148,12 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi
 	}, nil
 }
 
-// LinkAddressProtocol implements stack.LinkAddressResolver.
+// LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol.
 func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 	return header.IPv4ProtocolNumber
 }
 
-// LinkAddressRequest implements stack.LinkAddressResolver.
+// LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest.
 func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error {
 	r := &stack.Route{
 		RemoteLinkAddress: broadcastMAC,
@@ -172,7 +172,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	})
 }
 
-// ResolveStaticAddress implements stack.LinkAddressResolver.
+// ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress.
 func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
 	if addr == header.IPv4Broadcast {
 		return broadcastMAC, true
@@ -183,16 +183,22 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 	return tcpip.LinkAddress([]byte(nil)), false
 }
 
-// SetOption implements NetworkProtocol.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.NetworkProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements NetworkProtocol.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.NetworkProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
 
 // NewProtocol returns an ARP network protocol.
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 6597e6781..4f1742938 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -473,6 +473,12 @@ func (p *protocol) DefaultTTL() uint8 {
 	return uint8(atomic.LoadUint32(&p.defaultTTL))
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 180a480fd..9aef5234b 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -265,6 +265,12 @@ func (p *protocol) DefaultTTL() uint8 {
 	return uint8(atomic.LoadUint32(&p.defaultTTL))
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index d83adf0ec..f9fd8f18f 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -74,10 +74,11 @@ type TransportEndpoint interface {
 	// HandleControlPacket takes ownership of pkt.
 	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
 
-	// Close puts the endpoint in a closed state and frees all resources
-	// associated with it. This cleanup may happen asynchronously. Wait can
-	// be used to block on this asynchronous cleanup.
-	Close()
+	// Abort initiates an expedited endpoint teardown. It puts the endpoint
+	// in a closed state and frees all resources associated with it. This
+	// cleanup may happen asynchronously. Wait can be used to block on this
+	// asynchronous cleanup.
+	Abort()
 
 	// Wait waits for any worker goroutines owned by the endpoint to stop.
 	//
@@ -160,6 +161,13 @@ type TransportProtocol interface {
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
 	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
 }
 
 // TransportDispatcher contains the methods used by the network stack to deliver
@@ -293,6 +301,13 @@ type NetworkProtocol interface {
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
 	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
 }
 
 // NetworkDispatcher contains the methods used by the network stack to deliver
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 900dd46c5..ebb6c5e3b 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1446,7 +1446,13 @@ func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
 // Endpoints created or modified during this call may not get closed.
 func (s *Stack) Close() {
 	for _, e := range s.RegisteredEndpoints() {
-		e.Close()
+		e.Abort()
+	}
+	for _, p := range s.transportProtocols {
+		p.proto.Close()
+	}
+	for _, p := range s.networkProtocols {
+		p.Close()
 	}
 }
 
@@ -1464,6 +1470,12 @@ func (s *Stack) Wait() {
 	for _, e := range s.CleanupEndpoints() {
 		e.Wait()
 	}
+	for _, p := range s.transportProtocols {
+		p.proto.Wait()
+	}
+	for _, p := range s.networkProtocols {
+		p.Wait()
+	}
 
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 18016e7db..edf6bec52 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -235,6 +235,12 @@ func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Close implements TransportProtocol.Close.
+func (*fakeNetworkProtocol) Close() {}
+
+// Wait implements TransportProtocol.Wait.
+func (*fakeNetworkProtocol) Wait() {}
+
 func fakeNetFactory() stack.NetworkProtocol {
 	return &fakeNetworkProtocol{}
 }
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index d686e6eb8..778c0a4d6 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -306,26 +306,6 @@ func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, p
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
-// Close implements stack.TransportEndpoint.Close.
-func (ep *multiPortEndpoint) Close() {
-	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
-	ep.mu.RUnlock()
-	for _, e := range eps {
-		e.Close()
-	}
-}
-
-// Wait implements stack.TransportEndpoint.Wait.
-func (ep *multiPortEndpoint) Wait() {
-	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
-	ep.mu.RUnlock()
-	for _, e := range eps {
-		e.Wait()
-	}
-}
-
 // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
 // list. The list might be empty already.
 func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePort bool) *tcpip.Error {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 869c69a6d..5d1da2f8b 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -61,6 +61,10 @@ func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netP
 	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
 }
 
+func (f *fakeTransportEndpoint) Abort() {
+	f.Close()
+}
+
 func (f *fakeTransportEndpoint) Close() {
 	f.route.Release()
 }
@@ -272,7 +276,7 @@ func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.N
 	return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil
 }
 
-func (f *fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+func (*fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	return nil, tcpip.ErrUnknownProtocol
 }
 
@@ -310,6 +314,15 @@ func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Abort implements TransportProtocol.Abort.
+func (*fakeTransportProtocol) Abort() {}
+
+// Close implements tcpip.Endpoint.Close.
+func (*fakeTransportProtocol) Close() {}
+
+// Wait implements TransportProtocol.Wait.
+func (*fakeTransportProtocol) Wait() {}
+
 func fakeTransFactory() stack.TransportProtocol {
 	return &fakeTransportProtocol{}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index ce5527391..3dc5d87d6 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -341,9 +341,15 @@ type ControlMessages struct {
 // networking stack.
 type Endpoint interface {
 	// Close puts the endpoint in a closed state and frees all resources
-	// associated with it.
+	// associated with it. Close initiates the teardown process, the
+	// Endpoint may not be fully closed when Close returns.
 	Close()
 
+	// Abort initiates an expedited endpoint teardown. As compared to
+	// Close, Abort prioritizes closing the Endpoint quickly over cleanly.
+	// Abort is best effort; implementing Abort with Close is acceptable.
+	Abort()
+
 	// Read reads data from the endpoint and optionally returns the sender.
 	//
 	// This method does not block if there is no data pending. It will also
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 42afb3f5b..426da1ee6 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -96,6 +96,11 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 9ce500e80..113d92901 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,20 +104,26 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
 	return true
 }
 
-// SetOption implements TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.TransportProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.TransportProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // NewProtocol4 returns an ICMPv4 transport protocol.
 func NewProtocol4() stack.TransportProtocol {
 	return &protocol{ProtocolNumber4}
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index fc5bc69fa..5722815e9 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -98,6 +98,11 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 	return ep, nil
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close implements tcpip.Endpoint.Close.
 func (ep *endpoint) Close() {
 	ep.mu.Lock()
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index ee9c4c58b..2ef5fac76 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -121,6 +121,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 	return e, nil
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close implements tcpip.Endpoint.Close.
 func (e *endpoint) Close() {
 	e.mu.Lock()
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 08afb7c17..13e383ffc 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -299,6 +299,13 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.Close()
+		// Wake up any waiters. This is strictly not required normally
+		// as a socket that was never accepted can't really have any
+		// registered waiters except when stack.Wait() is called which
+		// waits for all registered endpoints to stop and expects an
+		// EventHUp.
+		ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+
 		if l.listenEP != nil {
 			l.removePendingEndpoint(ep)
 		}
@@ -607,7 +614,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		e.mu.Unlock()
 
 		// Notify waiters that the endpoint is shutdown.
-		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut)
+		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 	}()
 
 	s := sleep.Sleeper{}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 5c5397823..7730e6445 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1372,7 +1372,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 					e.snd.updateMaxPayloadSize(mtu, count)
 				}
 
-				if n&notifyReset != 0 {
+				if n&notifyReset != 0 || n&notifyAbort != 0 {
 					return tcpip.ErrConnectionAborted
 				}
 
@@ -1655,7 +1655,7 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 			}
 		case notification:
 			n := e.fetchNotifications()
-			if n&notifyClose != 0 {
+			if n&notifyClose != 0 || n&notifyAbort != 0 {
 				return nil
 			}
 			if n&notifyDrain != 0 {
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index e18012ac0..d792b07d6 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -68,17 +68,28 @@ func (q *epQueue) empty() bool {
 type processor struct {
 	epQ              epQueue
 	newEndpointWaker sleep.Waker
+	closeWaker       sleep.Waker
 	id               int
+	wg               sync.WaitGroup
 }
 
 func newProcessor(id int) *processor {
 	p := &processor{
 		id: id,
 	}
+	p.wg.Add(1)
 	go p.handleSegments()
 	return p
 }
 
+func (p *processor) close() {
+	p.closeWaker.Assert()
+}
+
+func (p *processor) wait() {
+	p.wg.Wait()
+}
+
 func (p *processor) queueEndpoint(ep *endpoint) {
 	// Queue an endpoint for processing by the processor goroutine.
 	p.epQ.enqueue(ep)
@@ -87,11 +98,17 @@ func (p *processor) queueEndpoint(ep *endpoint) {
 
 func (p *processor) handleSegments() {
 	const newEndpointWaker = 1
+	const closeWaker = 2
 	s := sleep.Sleeper{}
 	s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+	s.AddWaker(&p.closeWaker, closeWaker)
 	defer s.Done()
 	for {
-		s.Fetch(true)
+		id, ok := s.Fetch(true)
+		if ok && id == closeWaker {
+			p.wg.Done()
+			return
+		}
 		for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
 			if ep.segmentQueue.empty() {
 				continue
@@ -160,6 +177,18 @@ func newDispatcher(nProcessors int) *dispatcher {
 	}
 }
 
+func (d *dispatcher) close() {
+	for _, p := range d.processors {
+		p.close()
+	}
+}
+
+func (d *dispatcher) wait() {
+	for _, p := range d.processors {
+		p.wait()
+	}
+}
+
 func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep := stackEP.(*endpoint)
 	s := newSegment(r, id, pkt)
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index f2be0e651..f1ad19dac 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -121,6 +121,8 @@ const (
 	notifyDrain
 	notifyReset
 	notifyResetByPeer
+	// notifyAbort is a request for an expedited teardown.
+	notifyAbort
 	notifyKeepaliveChanged
 	notifyMSSChanged
 	// notifyTickleWorker is used to tickle the protocol main loop during a
@@ -785,6 +787,24 @@ func (e *endpoint) notifyProtocolGoroutine(n uint32) {
 	}
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	// The abort notification is not processed synchronously, so no
+	// synchronization is needed.
+	//
+	// If the endpoint becomes connected after this check, we still close
+	// the endpoint. This worst case results in a slower abort.
+	//
+	// If the endpoint disconnected after the check, nothing needs to be
+	// done, so sending a notification which will potentially be ignored is
+	// fine.
+	if e.EndpointState().connected() {
+		e.notifyProtocolGoroutine(notifyAbort)
+		return
+	}
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources associated
 // with it. It must be called only once and with no other concurrent calls to
 // the endpoint.
@@ -829,9 +849,18 @@ func (e *endpoint) closeNoShutdown() {
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
 	tcpip.AddDanglingEndpoint(e)
-	if !e.workerRunning {
+	switch e.EndpointState() {
+	// Sockets in StateSynRecv state(passive connections) are closed when
+	// the handshake fails or if the listening socket is closed while
+	// handshake was in progress. In such cases the handshake goroutine
+	// is already gone by the time Close is called and we need to cleanup
+	// here.
+	case StateInitial, StateBound, StateSynRecv:
 		e.cleanupLocked()
-	} else {
+		e.setEndpointState(StateClose)
+	case StateError, StateClose:
+		// do nothing.
+	default:
 		e.workerCleanup = true
 		e.notifyProtocolGoroutine(notifyClose)
 	}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 958c06fa7..73098d904 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -194,7 +194,7 @@ func replyWithReset(s *segment) {
 	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
 }
 
-// SetOption implements TransportProtocol.SetOption.
+// SetOption implements stack.TransportProtocol.SetOption.
 func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case SACKEnabled:
@@ -269,7 +269,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 	}
 }
 
-// Option implements TransportProtocol.Option.
+// Option implements stack.TransportProtocol.Option.
 func (p *protocol) Option(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case *SACKEnabled:
@@ -331,6 +331,16 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (p *protocol) Close() {
+	p.dispatcher.close()
+}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (p *protocol) Wait() {
+	p.dispatcher.wait()
+}
+
 // NewProtocol returns a TCP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index eff7f3600..1c6a600b8 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -186,6 +186,11 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 259c3072a..8df089d22 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -180,16 +180,22 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 	return true
 }
 
-// SetOption implements TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.TransportProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.TransportProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
 // NewProtocol returns a UDP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{}
-- 
cgit v1.2.3


From 93e0c3752981b6a1c5b745faec6506c17480b84b Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 21 Feb 2020 10:28:16 +0000
Subject: Enable bluepill dieTrampoline operation on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I9e1bf2513c23bdd8c387e5b3c874c6ad3ca9aab0
---
 pkg/sentry/platform/kvm/bluepill_arm64.s         |  8 ++++---
 pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 23 +++++++++++++++++++-
 pkg/sentry/platform/ring0/aarch64.go             | 27 ++++++++++++------------
 3 files changed, 41 insertions(+), 17 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
index c61700892..04efa0147 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.s
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -82,6 +82,8 @@ fallback:
 
 // dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
 TEXT ·dieTrampoline(SB),NOSPLIT,$0
-	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
-	MOVD	R9, 8(RSP)
-	BL	·dieHandler(SB)
+	// R0: Fake the old PC as caller
+	// R1: First argument (vCPU)
+	MOVD.P R1, 8(RSP) // R1: First argument (vCPU)
+	MOVD.P R0, 8(RSP) // R0: Fake the old PC as caller
+	B ·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 2f02c03cf..195331383 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -18,9 +18,30 @@ package kvm
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
+// dieArchSetup initialies the state for dieTrampoline.
+//
+// The arm64 dieTrampoline requires the vCPU to be set in R1, and the last PC
+// to be in R0. The trampoline then simulates a call to dieHandler from the
+// provided PC.
+//
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
-	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
+	// If the vCPU is in user mode, we set the stack to the stored stack
+	// value in the vCPU itself. We don't want to unwind the user stack.
+	if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t {
+		regs := c.CPU.Registers()
+		context.Regs[0] = regs.Regs[0]
+		context.Sp = regs.Sp
+		context.Regs[29] = regs.Regs[29] // stack base address
+	} else {
+		context.Regs[0] = guestRegs.Regs.Pc
+		context.Sp = guestRegs.Regs.Sp
+		context.Regs[29] = guestRegs.Regs.Regs[29]
+		context.Pstate = guestRegs.Regs.Pstate
+	}
+	context.Regs[1] = uint64(uintptr(unsafe.Pointer(c)))
+	context.Pc = uint64(dieTrampolineAddr)
 }
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index f6da41c27..8122ac6e2 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -27,26 +27,27 @@ const (
 	_PTE_PGT_BASE = 0x7000
 	_PTE_PGT_SIZE = 0x1000
 
-	_PSR_MODE_EL0t = 0x0
-	_PSR_MODE_EL1t = 0x4
-	_PSR_MODE_EL1h = 0x5
-	_PSR_EL_MASK   = 0xf
-
-	_PSR_D_BIT = 0x200
-	_PSR_A_BIT = 0x100
-	_PSR_I_BIT = 0x80
-	_PSR_F_BIT = 0x40
+	_PSR_D_BIT = 0x00000200
+	_PSR_A_BIT = 0x00000100
+	_PSR_I_BIT = 0x00000080
+	_PSR_F_BIT = 0x00000040
 )
 
 const (
+	// PSR bits
+	PSR_MODE_EL0t = 0x00000000
+	PSR_MODE_EL1t = 0x00000004
+	PSR_MODE_EL1h = 0x00000005
+	PSR_MODE_MASK = 0x0000000f
+
 	// KernelFlagsSet should always be set in the kernel.
-	KernelFlagsSet = _PSR_MODE_EL1h
+	KernelFlagsSet = PSR_MODE_EL1h
 
 	// UserFlagsSet are always set in userspace.
-	UserFlagsSet = _PSR_MODE_EL0t
+	UserFlagsSet = PSR_MODE_EL0t
 
-	KernelFlagsClear = _PSR_EL_MASK
-	UserFlagsClear   = _PSR_EL_MASK
+	KernelFlagsClear = PSR_MODE_MASK
+	UserFlagsClear   = PSR_MODE_MASK
 
 	PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
 )
-- 
cgit v1.2.3


From d7b73792515d1ac34c8d8c41ef5de379f22f002b Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 25 Feb 2020 11:18:28 -0800
Subject: Deflake TestCurrentConnectedIncrement.

TestCurrentConnectedIncrement fails consistently under gotsan due to the sleep
to check metrics is exactly the same as the TIME-WAIT duration. Under gotsan
things can be slow enough that the increment test is done before the protocol
goroutine is run after the TIME-WAIT timer expires and does its cleanup.

Increasing the sleep from 1s to 1.2s makes the test pass consistently.

PiperOrigin-RevId: 297160181
---
 pkg/tcpip/transport/tcp/tcp_test.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index cc118c993..5b2b16afa 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -543,8 +543,9 @@ func TestCurrentConnectedIncrement(t *testing.T) {
 		),
 	)
 
-	// Wait for the TIME-WAIT state to transition to CLOSED.
-	time.Sleep(1 * time.Second)
+	// Wait for a little more than the TIME-WAIT duration for the socket to
+	// transition to CLOSED state.
+	time.Sleep(1200 * time.Millisecond)
 
 	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
 		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
-- 
cgit v1.2.3


From 53504e29ca27b8dc9e098fbb88983fdbce90cca3 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 12:16:43 -0800
Subject: Fix mount refcount issue.

Each mount is holds a reference on a root Dirent, but the mount itself may
live beyond it's own reference. This means that a call to Root() can come
after the associated reference has been dropped.

Instead of introducing a separate layer of references for mount objects,
we simply change the Root() method to use TryIncRef() and allow it to return
nil if the mount is already gone. This requires updating a small number of
callers and minimizes the change (since VFSv2 will replace this code shortly).

PiperOrigin-RevId: 297174230
---
 pkg/sentry/fs/mount_test.go  | 11 ++++++-----
 pkg/sentry/fs/mounts.go      | 10 +++++++---
 pkg/sentry/fs/proc/mounts.go | 16 ++++++++++++----
 3 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go
index e672a438c..a3d10770b 100644
--- a/pkg/sentry/fs/mount_test.go
+++ b/pkg/sentry/fs/mount_test.go
@@ -36,11 +36,12 @@ func mountPathsAre(root *Dirent, got []*Mount, want ...string) error {
 	gotPaths := make(map[string]struct{}, len(got))
 	gotStr := make([]string, len(got))
 	for i, g := range got {
-		groot := g.Root()
-		name, _ := groot.FullName(root)
-		groot.DecRef()
-		gotStr[i] = name
-		gotPaths[name] = struct{}{}
+		if groot := g.Root(); groot != nil {
+			name, _ := groot.FullName(root)
+			groot.DecRef()
+			gotStr[i] = name
+			gotPaths[name] = struct{}{}
+		}
 	}
 	if len(got) != len(want) {
 		return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want)
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 574a2cc91..c7981f66e 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -100,10 +100,14 @@ func newUndoMount(d *Dirent) *Mount {
 	}
 }
 
-// Root returns the root dirent of this mount. Callers must call DecRef on the
-// returned dirent.
+// Root returns the root dirent of this mount.
+//
+// This may return nil if the mount has already been free. Callers must handle this
+// case appropriately. If non-nil, callers must call DecRef on the returned *Dirent.
 func (m *Mount) Root() *Dirent {
-	m.root.IncRef()
+	if !m.root.TryIncRef() {
+		return nil
+	}
 	return m.root
 }
 
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index c10888100..94deb553b 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -60,13 +60,15 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
 	})
 	for _, m := range ms {
 		mroot := m.Root()
+		if mroot == nil {
+			continue // No longer valid.
+		}
 		mountPath, desc := mroot.FullName(rootDir)
 		mroot.DecRef()
 		if !desc {
 			// MountSources that are not descendants of the chroot jail are ignored.
 			continue
 		}
-
 		fn(mountPath, m)
 	}
 }
@@ -91,6 +93,12 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 	var buf bytes.Buffer
 	forEachMount(mif.t, func(mountPath string, m *fs.Mount) {
+		mroot := m.Root()
+		if mroot == nil {
+			return // No longer valid.
+		}
+		defer mroot.DecRef()
+
 		// Format:
 		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
 		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
@@ -107,9 +115,6 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 		// (3) Major:Minor device ID. We don't have a superblock, so we
 		// just use the root inode device number.
-		mroot := m.Root()
-		defer mroot.DecRef()
-
 		sa := mroot.Inode.StableAttr
 		fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
 
@@ -207,6 +212,9 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan
 		//
 		// The "needs dump"and fsck flags are always 0, which is allowed.
 		root := m.Root()
+		if root == nil {
+			return // No longer valid.
+		}
 		defer root.DecRef()
 
 		flags := root.Inode.MountSource.Flags
-- 
cgit v1.2.3


From 98b693e61b37a62f7b29ce1cab8b4c4c54fa044e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 12:21:27 -0800
Subject: Don't acquire contended lock with the OS thread locked.

Fixes #1049

PiperOrigin-RevId: 297175164
---
 pkg/sentry/platform/kvm/machine.go | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 8076c7529..f1afc74dc 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -329,10 +329,12 @@ func (m *machine) Destroy() {
 }
 
 // Get gets an available vCPU.
+//
+// This will return with the OS thread locked.
 func (m *machine) Get() *vCPU {
+	m.mu.RLock()
 	runtime.LockOSThread()
 	tid := procid.Current()
-	m.mu.RLock()
 
 	// Check for an exact match.
 	if c := m.vCPUs[tid]; c != nil {
@@ -343,8 +345,22 @@ func (m *machine) Get() *vCPU {
 
 	// The happy path failed. We now proceed to acquire an exclusive lock
 	// (because the vCPU map may change), and scan all available vCPUs.
+	// In this case, we first unlock the OS thread. Otherwise, if mu is
+	// not available, the current system thread will be parked and a new
+	// system thread spawned. We avoid this situation by simply refreshing
+	// tid after relocking the system thread.
 	m.mu.RUnlock()
+	runtime.UnlockOSThread()
 	m.mu.Lock()
+	runtime.LockOSThread()
+	tid = procid.Current()
+
+	// Recheck for an exact match.
+	if c := m.vCPUs[tid]; c != nil {
+		c.lock()
+		m.mu.Unlock()
+		return c
+	}
 
 	for {
 		// Scan for an available vCPU.
-- 
cgit v1.2.3


From 6def8ea6ac601daa9256a31f818db9f7eb532168 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 12:22:09 -0800
Subject: Fix nested logging.

PiperOrigin-RevId: 297175316
---
 pkg/log/glog.go               |  6 ++---
 pkg/log/json.go               |  2 +-
 pkg/log/json_k8s.go           |  2 +-
 pkg/log/log.go                | 60 ++++++++++++++++++++++++++++++++-----------
 pkg/sentry/kernel/task_log.go |  6 ++---
 5 files changed, 52 insertions(+), 24 deletions(-)

(limited to 'pkg')

diff --git a/pkg/log/glog.go b/pkg/log/glog.go
index cab5fae55..b4f7bb5a4 100644
--- a/pkg/log/glog.go
+++ b/pkg/log/glog.go
@@ -46,7 +46,7 @@ var pid = os.Getpid()
 //   line             The line number
 //   msg              The user-supplied message
 //
-func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
+func (g *GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) {
 	// Log level.
 	prefix := byte('?')
 	switch level {
@@ -64,9 +64,7 @@ func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, ar
 	microsecond := int(timestamp.Nanosecond() / 1000)
 
 	// 0 = this frame.
-	// 1 = Debugf, etc.
-	// 2 = Caller.
-	_, file, line, ok := runtime.Caller(2)
+	_, file, line, ok := runtime.Caller(depth + 1)
 	if ok {
 		// Trim any directory path from the file.
 		slash := strings.LastIndexByte(file, byte('/'))
diff --git a/pkg/log/json.go b/pkg/log/json.go
index a278c8fc8..0943db1cc 100644
--- a/pkg/log/json.go
+++ b/pkg/log/json.go
@@ -62,7 +62,7 @@ type JSONEmitter struct {
 }
 
 // Emit implements Emitter.Emit.
-func (e JSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e JSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := jsonLog{
 		Msg:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go
index cee6eb514..6c6fc8b6f 100644
--- a/pkg/log/json_k8s.go
+++ b/pkg/log/json_k8s.go
@@ -33,7 +33,7 @@ type K8sJSONEmitter struct {
 }
 
 // Emit implements Emitter.Emit.
-func (e *K8sJSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e *K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := k8sJSONLog{
 		Log:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 5056f17e6..a794da1aa 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -79,7 +79,7 @@ func (l Level) String() string {
 type Emitter interface {
 	// Emit emits the given log statement. This allows for control over the
 	// timestamp used for logging.
-	Emit(level Level, timestamp time.Time, format string, v ...interface{})
+	Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{})
 }
 
 // Writer writes the output to the given writer.
@@ -142,7 +142,7 @@ func (l *Writer) Write(data []byte) (int, error) {
 }
 
 // Emit emits the message.
-func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
+func (l *Writer) Emit(_ int, _ Level, _ time.Time, format string, args ...interface{}) {
 	fmt.Fprintf(l, format, args...)
 }
 
@@ -150,9 +150,9 @@ func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...i
 type MultiEmitter []Emitter
 
 // Emit emits to all emitters.
-func (m *MultiEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (m *MultiEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	for _, e := range *m {
-		e.Emit(level, timestamp, format, v...)
+		e.Emit(1+depth, level, timestamp, format, v...)
 	}
 }
 
@@ -167,7 +167,7 @@ type TestEmitter struct {
 }
 
 // Emit emits to the TestLogger.
-func (t *TestEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (t *TestEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	t.Logf(format, v...)
 }
 
@@ -198,22 +198,37 @@ type BasicLogger struct {
 
 // Debugf implements logger.Debugf.
 func (l *BasicLogger) Debugf(format string, v ...interface{}) {
-	if l.IsLogging(Debug) {
-		l.Emit(Debug, time.Now(), format, v...)
-	}
+	l.DebugfAtDepth(1, format, v...)
 }
 
 // Infof implements logger.Infof.
 func (l *BasicLogger) Infof(format string, v ...interface{}) {
-	if l.IsLogging(Info) {
-		l.Emit(Info, time.Now(), format, v...)
-	}
+	l.InfofAtDepth(1, format, v...)
 }
 
 // Warningf implements logger.Warningf.
 func (l *BasicLogger) Warningf(format string, v ...interface{}) {
+	l.WarningfAtDepth(1, format, v...)
+}
+
+// DebugfAtDepth logs at a specific depth.
+func (l *BasicLogger) DebugfAtDepth(depth int, format string, v ...interface{}) {
+	if l.IsLogging(Debug) {
+		l.Emit(1+depth, Debug, time.Now(), format, v...)
+	}
+}
+
+// InfofAtDepth logs at a specific depth.
+func (l *BasicLogger) InfofAtDepth(depth int, format string, v ...interface{}) {
+	if l.IsLogging(Info) {
+		l.Emit(1+depth, Info, time.Now(), format, v...)
+	}
+}
+
+// WarningfAtDepth logs at a specific depth.
+func (l *BasicLogger) WarningfAtDepth(depth int, format string, v ...interface{}) {
 	if l.IsLogging(Warning) {
-		l.Emit(Warning, time.Now(), format, v...)
+		l.Emit(1+depth, Warning, time.Now(), format, v...)
 	}
 }
 
@@ -257,17 +272,32 @@ func SetLevel(newLevel Level) {
 
 // Debugf logs to the global logger.
 func Debugf(format string, v ...interface{}) {
-	Log().Debugf(format, v...)
+	Log().DebugfAtDepth(1, format, v...)
 }
 
 // Infof logs to the global logger.
 func Infof(format string, v ...interface{}) {
-	Log().Infof(format, v...)
+	Log().InfofAtDepth(1, format, v...)
 }
 
 // Warningf logs to the global logger.
 func Warningf(format string, v ...interface{}) {
-	Log().Warningf(format, v...)
+	Log().WarningfAtDepth(1, format, v...)
+}
+
+// DebugfAtDepth logs to the global logger.
+func DebugfAtDepth(depth int, format string, v ...interface{}) {
+	Log().DebugfAtDepth(1+depth, format, v...)
+}
+
+// InfofAtDepth logs to the global logger.
+func InfofAtDepth(depth int, format string, v ...interface{}) {
+	Log().InfofAtDepth(1+depth, format, v...)
+}
+
+// WarningfAtDepth logs to the global logger.
+func WarningfAtDepth(depth int, format string, v ...interface{}) {
+	Log().WarningfAtDepth(1+depth, format, v...)
 }
 
 // defaultStackSize is the default buffer size to allocate for stack traces.
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 6d737d3e5..eeccaa197 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -32,21 +32,21 @@ const (
 // Infof logs an formatted info message by calling log.Infof.
 func (t *Task) Infof(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Info) {
-		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+		log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Warningf logs a warning string by calling log.Warningf.
 func (t *Task) Warningf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Warning) {
-		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+		log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Debugf creates a debug string that includes the task ID.
 func (t *Task) Debugf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Debug) {
-		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+		log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
-- 
cgit v1.2.3


From 471b15b212831af31c2fe36cd42cea7ec7b7785b Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 25 Feb 2020 13:25:36 -0800
Subject: Port most syscalls to VFS2.

pipe and pipe2 aren't ported, pending a slight rework of pipe FDs for VFS2.
mount and umount2 aren't ported out of temporary laziness. access and faccessat
need additional FSImpl methods to implement properly, but are stubbed to
prevent googletest from CHECK-failing. Other syscalls require additional
plumbing.

Updates #1623

PiperOrigin-RevId: 297188448
---
 pkg/abi/linux/epoll_amd64.go                       |   2 +
 pkg/abi/linux/epoll_arm64.go                       |   2 +
 pkg/abi/linux/file.go                              |   2 +
 pkg/abi/linux/fs.go                                |   2 +
 pkg/abi/linux/signal.go                            |   2 +
 pkg/abi/linux/time.go                              |   6 +
 pkg/abi/linux/xattr.go                             |   1 +
 pkg/fspath/BUILD                                   |   4 +-
 pkg/fspath/builder.go                              |   8 +
 pkg/fspath/builder_unsafe.go                       |  27 -
 pkg/fspath/fspath.go                               |   3 +-
 pkg/gohacks/BUILD                                  |  11 +
 pkg/gohacks/gohacks_unsafe.go                      |  57 ++
 pkg/sentry/fsbridge/vfs.go                         |  10 +-
 pkg/sentry/fsimpl/proc/tasks.go                    |   4 +-
 pkg/sentry/kernel/fd_table.go                      |  49 +-
 pkg/sentry/kernel/fs_context.go                    |  22 +
 pkg/sentry/kernel/task.go                          |  18 +
 pkg/sentry/syscalls/linux/sys_epoll.go             |   4 +
 pkg/sentry/syscalls/linux/sys_file.go              |  40 ++
 pkg/sentry/syscalls/linux/sys_getdents.go          |   4 +
 pkg/sentry/syscalls/linux/sys_lseek.go             |   4 +
 pkg/sentry/syscalls/linux/sys_mmap.go              |   4 +
 pkg/sentry/syscalls/linux/sys_read.go              |   4 +
 pkg/sentry/syscalls/linux/sys_stat.go              |   4 +
 pkg/sentry/syscalls/linux/sys_sync.go              |   4 +
 pkg/sentry/syscalls/linux/sys_write.go             |   4 +
 pkg/sentry/syscalls/linux/sys_xattr.go             |   4 +
 pkg/sentry/syscalls/linux/vfs2/BUILD               |  28 +-
 pkg/sentry/syscalls/linux/vfs2/epoll.go            | 225 ++++++++
 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go     |  44 ++
 pkg/sentry/syscalls/linux/vfs2/execve.go           | 137 +++++
 pkg/sentry/syscalls/linux/vfs2/fd.go               | 147 ++++++
 pkg/sentry/syscalls/linux/vfs2/filesystem.go       | 326 ++++++++++++
 pkg/sentry/syscalls/linux/vfs2/fscontext.go        | 131 +++++
 pkg/sentry/syscalls/linux/vfs2/getdents.go         | 149 ++++++
 pkg/sentry/syscalls/linux/vfs2/ioctl.go            |  35 ++
 .../syscalls/linux/vfs2/linux64_override_amd64.go  | 216 ++++----
 .../syscalls/linux/vfs2/linux64_override_arm64.go  |   2 +
 pkg/sentry/syscalls/linux/vfs2/mmap.go             |  92 ++++
 pkg/sentry/syscalls/linux/vfs2/path.go             |  94 ++++
 pkg/sentry/syscalls/linux/vfs2/poll.go             | 584 +++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/read_write.go       | 511 ++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/setstat.go          | 380 ++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/stat.go             | 346 ++++++++++++
 pkg/sentry/syscalls/linux/vfs2/sync.go             |  87 +++
 pkg/sentry/syscalls/linux/vfs2/sys_read.go         |  95 ----
 pkg/sentry/syscalls/linux/vfs2/xattr.go            | 353 +++++++++++++
 pkg/sentry/vfs/BUILD                               |   1 +
 pkg/sentry/vfs/epoll.go                            |   3 +
 pkg/sentry/vfs/mount_unsafe.go                     |  12 +-
 pkg/sentry/vfs/resolving_path.go                   |   2 +-
 pkg/sentry/vfs/vfs.go                              |  10 +-
 pkg/usermem/BUILD                                  |   2 +-
 pkg/usermem/usermem.go                             |   9 +-
 pkg/usermem/usermem_unsafe.go                      |  27 -
 runsc/boot/filter/config.go                        |   2 +
 57 files changed, 4082 insertions(+), 274 deletions(-)
 delete mode 100644 pkg/fspath/builder_unsafe.go
 create mode 100644 pkg/gohacks/BUILD
 create mode 100644 pkg/gohacks/gohacks_unsafe.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/execve.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/fd.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/filesystem.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/fscontext.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/getdents.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/ioctl.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/mmap.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/path.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/poll.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/read_write.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/setstat.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/sync.go
 delete mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_read.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/xattr.go
 delete mode 100644 pkg/usermem/usermem_unsafe.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go
index 57041491c..34ff18009 100644
--- a/pkg/abi/linux/epoll_amd64.go
+++ b/pkg/abi/linux/epoll_amd64.go
@@ -15,6 +15,8 @@
 package linux
 
 // EpollEvent is equivalent to struct epoll_event from epoll(2).
+//
+// +marshal
 type EpollEvent struct {
 	Events uint32
 	// Linux makes struct epoll_event::data a __u64. We represent it as
diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go
index 62ef5821e..f86c35329 100644
--- a/pkg/abi/linux/epoll_arm64.go
+++ b/pkg/abi/linux/epoll_arm64.go
@@ -15,6 +15,8 @@
 package linux
 
 // EpollEvent is equivalent to struct epoll_event from epoll(2).
+//
+// +marshal
 type EpollEvent struct {
 	Events uint32
 	// Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index c3ab15a4f..e229ac21c 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -241,6 +241,8 @@ const (
 )
 
 // Statx represents struct statx.
+//
+// +marshal
 type Statx struct {
 	Mask           uint32
 	Blksize        uint32
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index 2c652baa2..158d2db5b 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -38,6 +38,8 @@ const (
 )
 
 // Statfs is struct statfs, from uapi/asm-generic/statfs.h.
+//
+// +marshal
 type Statfs struct {
 	// Type is one of the filesystem magic values, defined above.
 	Type uint64
diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go
index c69b04ea9..1c330e763 100644
--- a/pkg/abi/linux/signal.go
+++ b/pkg/abi/linux/signal.go
@@ -115,6 +115,8 @@ const (
 )
 
 // SignalSet is a signal mask with a bit corresponding to each signal.
+//
+// +marshal
 type SignalSet uint64
 
 // SignalSetSize is the size in bytes of a SignalSet.
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index e562b46d9..e6860ed49 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -157,6 +157,8 @@ func DurationToTimespec(dur time.Duration) Timespec {
 const SizeOfTimeval = 16
 
 // Timeval represents struct timeval in <time.h>.
+//
+// +marshal
 type Timeval struct {
 	Sec  int64
 	Usec int64
@@ -230,6 +232,8 @@ type Tms struct {
 type TimerID int32
 
 // StatxTimestamp represents struct statx_timestamp.
+//
+// +marshal
 type StatxTimestamp struct {
 	Sec  int64
 	Nsec uint32
@@ -258,6 +262,8 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
 }
 
 // Utime represents struct utimbuf used by utimes(2).
+//
+// +marshal
 type Utime struct {
 	Actime  int64
 	Modtime int64
diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
index a3b6406fa..99180b208 100644
--- a/pkg/abi/linux/xattr.go
+++ b/pkg/abi/linux/xattr.go
@@ -18,6 +18,7 @@ package linux
 const (
 	XATTR_NAME_MAX = 255
 	XATTR_SIZE_MAX = 65536
+	XATTR_LIST_MAX = 65536
 
 	XATTR_CREATE  = 1
 	XATTR_REPLACE = 2
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index ee84471b2..67dd1e225 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -8,9 +8,11 @@ go_library(
     name = "fspath",
     srcs = [
         "builder.go",
-        "builder_unsafe.go",
         "fspath.go",
     ],
+    deps = [
+        "//pkg/gohacks",
+    ],
 )
 
 go_test(
diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go
index 7ddb36826..6318d3874 100644
--- a/pkg/fspath/builder.go
+++ b/pkg/fspath/builder.go
@@ -16,6 +16,8 @@ package fspath
 
 import (
 	"fmt"
+
+	"gvisor.dev/gvisor/pkg/gohacks"
 )
 
 // Builder is similar to strings.Builder, but is used to produce pathnames
@@ -102,3 +104,9 @@ func (b *Builder) AppendString(str string) {
 	copy(b.buf[b.start:], b.buf[oldStart:])
 	copy(b.buf[len(b.buf)-len(str):], str)
 }
+
+// String returns the accumulated string. No other methods should be called
+// after String.
+func (b *Builder) String() string {
+	return gohacks.StringFromImmutableBytes(b.buf[b.start:])
+}
diff --git a/pkg/fspath/builder_unsafe.go b/pkg/fspath/builder_unsafe.go
deleted file mode 100644
index 75606808d..000000000
--- a/pkg/fspath/builder_unsafe.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package fspath
-
-import (
-	"unsafe"
-)
-
-// String returns the accumulated string. No other methods should be called
-// after String.
-func (b *Builder) String() string {
-	bs := b.buf[b.start:]
-	// Compare strings.Builder.String().
-	return *(*string)(unsafe.Pointer(&bs))
-}
diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go
index 9fb3fee24..4c983d5fd 100644
--- a/pkg/fspath/fspath.go
+++ b/pkg/fspath/fspath.go
@@ -67,7 +67,8 @@ func Parse(pathname string) Path {
 
 // Path contains the information contained in a pathname string.
 //
-// Path is copyable by value.
+// Path is copyable by value. The zero value for Path is equivalent to
+// fspath.Parse(""), i.e. the empty path.
 type Path struct {
 	// Begin is an iterator to the first path component in the relative part of
 	// the path.
diff --git a/pkg/gohacks/BUILD b/pkg/gohacks/BUILD
new file mode 100644
index 000000000..798a65eca
--- /dev/null
+++ b/pkg/gohacks/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gohacks",
+    srcs = [
+        "gohacks_unsafe.go",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/pkg/gohacks/gohacks_unsafe.go b/pkg/gohacks/gohacks_unsafe.go
new file mode 100644
index 000000000..aad675172
--- /dev/null
+++ b/pkg/gohacks/gohacks_unsafe.go
@@ -0,0 +1,57 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gohacks contains utilities for subverting the Go compiler.
+package gohacks
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+// Noescape hides a pointer from escape analysis. Noescape is the identity
+// function but escape analysis doesn't think the output depends on the input.
+// Noescape is inlined and currently compiles down to zero instructions.
+// USE CAREFULLY!
+//
+// (Noescape is copy/pasted from Go's runtime/stubs.go:noescape().)
+//
+//go:nosplit
+func Noescape(p unsafe.Pointer) unsafe.Pointer {
+	x := uintptr(p)
+	return unsafe.Pointer(x ^ 0)
+}
+
+// ImmutableBytesFromString is equivalent to []byte(s), except that it uses the
+// same memory backing s instead of making a heap-allocated copy. This is only
+// valid if the returned slice is never mutated.
+func ImmutableBytesFromString(s string) []byte {
+	shdr := (*reflect.StringHeader)(unsafe.Pointer(&s))
+	var bs []byte
+	bshdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
+	bshdr.Data = shdr.Data
+	bshdr.Len = shdr.Len
+	bshdr.Cap = shdr.Len
+	return bs
+}
+
+// StringFromImmutableBytes is equivalent to string(bs), except that it uses
+// the same memory backing bs instead of making a heap-allocated copy. This is
+// only valid if bs is never mutated after StringFromImmutableBytes returns.
+func StringFromImmutableBytes(bs []byte) string {
+	// This is cheaper than messing with reflect.StringHeader and
+	// reflect.SliceHeader, which as of this writing produces many dead stores
+	// of zeroes. Compare strings.Builder.String().
+	return *(*string)(unsafe.Pointer(&bs))
+}
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index e657c39bc..6aa17bfc1 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -117,15 +117,19 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry)
 // default anyways.
 //
 // TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
-func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
+func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
 	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
 	creds := auth.CredentialsFromContext(ctx)
+	path := fspath.Parse(pathname)
 	pop := &vfs.PathOperation{
 		Root:               l.root,
-		Start:              l.root,
-		Path:               fspath.Parse(path),
+		Start:              l.workingDir,
+		Path:               path,
 		FollowFinalSymlink: resolveFinal,
 	}
+	if path.Absolute {
+		pop.Start = l.root
+	}
 	fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index ce08a7d53..10c08fa90 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -73,9 +73,9 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
 		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
 		"net":     newNetDir(root, inoGen, k),
-		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{}),
+		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}),
 		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
-		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
+		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}),
 	}
 
 	inode := &tasksInode{
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 23b88f7a6..58001d56c 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	return fds, nil
 }
 
+// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
+// the given file description. If it succeeds, it takes a reference on file.
+func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	if minfd < 0 {
+		// Don't accept negative FDs.
+		return -1, syscall.EINVAL
+	}
+
+	// Default limit.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity {
+			end = int32(lim.Cur)
+		}
+		if minfd >= end {
+			return -1, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// From f.next to find available fd.
+	fd := minfd
+	if fd < f.next {
+		fd = f.next
+	}
+	for fd < end {
+		if d, _, _ := f.get(fd); d == nil {
+			f.setVFS2(fd, file, flags)
+			if fd == f.next {
+				// Update next search start position.
+				f.next = fd + 1
+			}
+			return fd, nil
+		}
+		fd++
+	}
+	return -1, syscall.EMFILE
+}
+
 // NewFDAt sets the file reference for the given FD. If there is an active
 // reference for that FD, the ref count for that existing reference is
 // decremented.
@@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 		return syscall.EBADF
 	}
 
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
 	// Check the limit for the provided file.
 	if limitSet := limits.FromContext(ctx); limitSet != nil {
 		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
@@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 	}
 
 	// Install the entry.
+	f.mu.Lock()
+	defer f.mu.Unlock()
 	f.setAll(fd, file, fileVFS2, flags)
 	return nil
 }
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 7218aa24e..47f78df9a 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -244,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
+//
+// This is not a valid call after free.
+func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+	if !vd.Ok() {
+		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
+	}
+
+	f.mu.Lock()
+
+	if !f.rootVFS2.Ok() {
+		f.mu.Unlock()
+		panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd))
+	}
+
+	old := f.rootVFS2
+	vd.IncRef()
+	f.rootVFS2 = vd
+	f.mu.Unlock()
+	old.DecRef()
+}
+
 // Umask returns the current umask.
 func (f *FSContext) Umask() uint {
 	f.mu.Lock()
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e37e23231..2cee2e6ed 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -789,6 +789,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error)
 	return fds[0], nil
 }
 
+// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	return t.fdTable.NewFDVFS2(t, fd, file, flags)
+}
+
 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
 //
 // This automatically passes the task as the context.
@@ -798,6 +807,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
 	return t.fdTable.NewFDAt(t, fd, file, flags)
 }
 
+// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
+}
+
 // WithMuLocked executes f with t.mu locked.
 func (t *Task) WithMuLocked(f func(*Task)) {
 	t.mu.Lock()
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index fbef5b376..3ab93fbde 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // EpollCreate1 implements the epoll_create1(2) linux syscall.
 func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	flags := args[0].Int()
@@ -164,3 +166,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	return EpollWait(t, args)
 }
+
+// LINT.ThenChange(vfs2/epoll.go)
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 421845ebb..c21f14dc0 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string
 	return path, dirPath, nil
 }
 
+// LINT.IfChange
+
 func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -575,6 +577,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Ioctl implements linux syscall ioctl(2).
 func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -650,6 +656,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/ioctl.go)
+
+// LINT.IfChange
+
 // Getcwd implements the linux syscall getcwd(2).
 func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -760,6 +770,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/fscontext.go)
+
+// LINT.IfChange
+
 // Close implements linux syscall close(2).
 func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -1094,6 +1108,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/fd.go)
+
 const (
 	_FADV_NORMAL     = 0
 	_FADV_RANDOM     = 1
@@ -1141,6 +1157,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.IfChange
+
 func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
 	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1421,6 +1439,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1480,6 +1502,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	return n, nil, err
 }
 
+// LINT.ThenChange(vfs2/stat.go)
+
+// LINT.IfChange
+
 func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1516,6 +1542,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, unlinkAt(t, dirFD, addr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Truncate implements linux syscall truncate(2).
 func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -1614,6 +1644,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
 // Umask implements linux syscall umask(2).
 func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	mask := args[0].ModeT()
@@ -1621,6 +1653,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	return uintptr(mask), nil, nil
 }
 
+// LINT.IfChange
+
 // Change ownership of a file.
 //
 // uid and gid may be -1, in which case they will not be changed.
@@ -1987,6 +2021,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
+// LINT.IfChange
+
 func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
 	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
 	if err != nil {
@@ -2042,6 +2080,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
 // Fallocate implements linux system call fallocate(2).
 func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index f66f4ffde..b126fecc0 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -27,6 +27,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // Getdents implements linux syscall getdents(2) for 64bit systems.
 func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
 func (ds *direntSerializer) Written() int {
 	return ds.written
 }
+
+// LINT.ThenChange(vfs2/getdents.go)
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index 297e920c4..3f7691eae 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Lseek implements linux syscall lseek(2).
 func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 	return uintptr(offset), nil, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 9959f6e61..91694d374 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 	return uintptr(addr), nil, nil
 }
 
+// LINT.IfChange
+
 // Mmap implements linux syscall mmap(2).
 func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	prot := args[2].Int()
@@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(rv), nil, err
 }
 
+// LINT.ThenChange(vfs2/mmap.go)
+
 // Munmap implements linux syscall munmap(2).
 func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 227692f06..78a2cb750 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskRead contains events that can be triggered on reads.
 	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
@@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 11f25e00d..701b27b4a 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
 	return linux.Stat{
 		Dev:     sattr.DeviceID,
@@ -297,3 +299,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
 	_, err = t.CopyOut(addr, &statfs)
 	return err
 }
+
+// LINT.ThenChange(vfs2/stat.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 3e55235bd..5ad465ae3 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Sync implements linux system call sync(2).
 func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	t.MountNamespace().SyncAll(t)
@@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 
 	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
 }
+
+// LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index aba892939..506ee54ce 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskWrite contains events that can be triggered on writes.
 	//
@@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 9d8140b8a..2de5e3422 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // GetXattr implements linux syscall getxattr(2).
 func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return getXattrFromPath(t, args, true)
@@ -418,3 +420,5 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
 
 	return d.Inode.RemoveXattr(t, d, name)
 }
+
+// LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 6b8a00b6e..f51761e81 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -5,18 +5,44 @@ package(licenses = ["notice"])
 go_library(
     name = "vfs2",
     srcs = [
+        "epoll.go",
+        "epoll_unsafe.go",
+        "execve.go",
+        "fd.go",
+        "filesystem.go",
+        "fscontext.go",
+        "getdents.go",
+        "ioctl.go",
         "linux64.go",
         "linux64_override_amd64.go",
         "linux64_override_arm64.go",
-        "sys_read.go",
+        "mmap.go",
+        "path.go",
+        "poll.go",
+        "read_write.go",
+        "setstat.go",
+        "stat.go",
+        "sync.go",
+        "xattr.go",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/gohacks",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/syscalls",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
new file mode 100644
index 000000000..d6cb0e79a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -0,0 +1,225 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// EpollCreate1 implements Linux syscall epoll_create1(2).
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if flags&^linux.EPOLL_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements Linux syscall epoll_create(2).
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+
+	// "Since Linux 2.6.8, the size argument is ignored, but must be greater
+	// than zero" - epoll_create(2)
+	if size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements Linux syscall epoll_ctl(2).
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	op := args[1].Int()
+	fd := args[2].Int()
+	eventAddr := args[3].Pointer()
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+	if epfile == file {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var event linux.EpollEvent
+	switch op {
+	case linux.EPOLL_CTL_ADD:
+		if err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.AddInterest(file, fd, event)
+	case linux.EPOLL_CTL_DEL:
+		return 0, nil, ep.DeleteInterest(file, fd)
+	case linux.EPOLL_CTL_MOD:
+		if err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.ModifyInterest(file, fd, event)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// EpollWait implements Linux syscall epoll_wait(2).
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	eventsAddr := args[1].Pointer()
+	maxEvents := int(args[2].Int())
+	timeout := int(args[3].Int())
+
+	const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
+	if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
+		return 0, nil, syserror.EINVAL
+	}
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
+	// maxEvents), so that the buffer can be allocated on the stack.
+	var (
+		events       [16]linux.EpollEvent
+		total        int
+		ch           chan struct{}
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	for {
+		batchEvents := len(events)
+		if batchEvents > maxEvents {
+			batchEvents = maxEvents
+		}
+		n := ep.ReadEvents(events[:batchEvents])
+		maxEvents -= n
+		if n != 0 {
+			// Copy what we read out.
+			copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n])
+			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
+			total += copiedEvents
+			if err != nil {
+				if total != 0 {
+					return uintptr(total), nil, nil
+				}
+				return 0, nil, err
+			}
+			// If we've filled the application's event buffer, we're done.
+			if maxEvents == 0 {
+				return uintptr(total), nil, nil
+			}
+			// Loop if we read a full batch, under the expectation that there
+			// may be more events to read.
+			if n == batchEvents {
+				continue
+			}
+		}
+		// We get here if n != batchEvents. If we read any number of events
+		// (just now, or in a previous iteration of this loop), or if timeout
+		// is 0 (such that epoll_wait should be non-blocking), return the
+		// events we've read so far to the application.
+		if total != 0 || timeout == 0 {
+			return uintptr(total), nil, nil
+		}
+		// In the first iteration of this loop, register with the epoll
+		// instance for readability events, but then immediately continue the
+		// loop since we need to retry ReadEvents() before blocking. In all
+		// subsequent iterations, block until events are available, the timeout
+		// expires, or an interrupt arrives.
+		if ch == nil {
+			var w waiter.Entry
+			w, ch = waiter.NewChannelEntry(nil)
+			epfile.EventRegister(&w, waiter.EventIn)
+			defer epfile.EventUnregister(&w)
+		} else {
+			// Set up the timer if a timeout was specified.
+			if timeout > 0 && !haveDeadline {
+				timeoutDur := time.Duration(timeout) * time.Millisecond
+				deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+				haveDeadline = true
+			}
+			if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+				if err == syserror.ETIMEDOUT {
+					err = nil
+				}
+				// total must be 0 since otherwise we would have returned
+				// above.
+				return 0, nil, err
+			}
+		}
+	}
+}
+
+// EpollPwait implements Linux syscall epoll_pwait(2).
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	maskAddr := args[4].Pointer()
+	maskSize := uint(args[5].Uint())
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	return EpollWait(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
new file mode 100644
index 000000000..825f325bf
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go
@@ -0,0 +1,44 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"reflect"
+	"runtime"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{}))
+
+func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) {
+	if len(events) == 0 {
+		return 0, nil
+	}
+	// Cast events to a byte slice for copying.
+	var eventBytes []byte
+	eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes))
+	eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0])))
+	eventBytesHdr.Len = len(events) * sizeofEpollEvent
+	eventBytesHdr.Cap = len(events) * sizeofEpollEvent
+	copiedBytes, err := t.CopyOutBytes(addr, eventBytes)
+	runtime.KeepAlive(events)
+	copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
+	return copiedEvents, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
new file mode 100644
index 000000000..aef0078a8
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathnameAddr := args[0].Pointer()
+	argvAddr := args[1].Pointer()
+	envvAddr := args[2].Pointer()
+	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	argvAddr := args[2].Pointer()
+	envvAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+	if err != nil {
+		return 0, nil, err
+	}
+	var argv, envv []string
+	if argvAddr != 0 {
+		var err error
+		argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+	if envvAddr != 0 {
+		var err error
+		envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	var executable fsbridge.File
+	closeOnExec := false
+	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
+		// We must open the executable ourselves since dirfd is used as the
+		// starting point while resolving path, but the task working directory
+		// is used as the starting point while resolving interpreters (Linux:
+		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
+		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
+		// incapable of handling this correctly.
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd)
+		if dirfile == nil {
+			return 0, nil, syserror.EBADF
+		}
+		start := dirfile.VirtualDentry()
+		start.IncRef()
+		dirfile.DecRef()
+		closeOnExec = dirfileFlags.CloseOnExec
+		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		}, &vfs.OpenOptions{
+			Flags:    linux.O_RDONLY,
+			FileExec: true,
+		})
+		start.DecRef()
+		if err != nil {
+			return 0, nil, err
+		}
+		defer file.DecRef()
+		executable = fsbridge.NewVFSFile(file)
+	}
+
+	// Load the new TaskContext.
+	mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
+	defer mntns.DecRef()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	defer wd.DecRef()
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Opener:              fsbridge.NewVFSLookup(mntns, root, wd),
+		RemainingTraversals: &remainingTraversals,
+		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		Filename:            pathname,
+		File:                executable,
+		CloseOnExec:         closeOnExec,
+		Argv:                argv,
+		Envv:                envv,
+		Features:            t.Arch().FeatureSet(),
+	}
+
+	tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
+	if se != nil {
+		return 0, nil, se.ToError()
+	}
+
+	ctrl, err := t.Execve(tc)
+	return 0, ctrl, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
new file mode 100644
index 000000000..3afcea665
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -0,0 +1,147 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Close implements Linux syscall close(2).
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	// Note that Remove provides a reference on the file that we may use to
+	// flush. It is still active until we drop the final reference below
+	// (and other reference-holding operations complete).
+	_, file := t.FDTable().Remove(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := file.OnClose(t)
+	return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file)
+}
+
+// Dup implements Linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, syserror.EMFILE
+	}
+	return uintptr(newFD), nil, nil
+}
+
+// Dup2 implements Linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+
+	if oldfd == newfd {
+		// As long as oldfd is valid, dup2() does nothing and returns newfd.
+		file := t.GetFileVFS2(oldfd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		file.DecRef()
+		return uintptr(newfd), nil, nil
+	}
+
+	return dup3(t, oldfd, newfd, 0)
+}
+
+// Dup3 implements Linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+	flags := args[2].Uint()
+
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return dup3(t, oldfd, newfd, flags)
+}
+
+func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^linux.O_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(oldfd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newfd), nil, nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	cmd := args[1].Int()
+
+	file, flags := t.FDTable().GetVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+		minfd := args[2].Int()
+		fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{
+			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+		})
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case linux.F_GETFD:
+		return uintptr(flags.ToLinuxFDFlags()), nil, nil
+	case linux.F_SETFD:
+		flags := args[2].Uint()
+		t.FDTable().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+		})
+		return 0, nil, nil
+	case linux.F_GETFL:
+		return uintptr(file.StatusFlags()), nil, nil
+	case linux.F_SETFL:
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+	default:
+		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
new file mode 100644
index 000000000..fc5ceea4c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -0,0 +1,326 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Link implements Linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Linkat implements Linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+	if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+		return syserror.ENOENT
+	}
+
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
+}
+
+// Mkdir implements Linux syscall mkdir(2).
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Mkdirat implements Linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, mkdirat(t, dirfd, addr, mode)
+}
+
+func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
+		Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+}
+
+// Mknod implements Linux syscall mknod(2).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	dev := args[2].Uint()
+	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+}
+
+// Mknodat implements Linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	dev := args[3].Uint()
+	return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+}
+
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	major, minor := linux.DecodeDeviceID(dev)
+	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
+		Mode:     linux.FileMode(mode &^ t.FSContext().Umask()),
+		DevMajor: uint32(major),
+		DevMinor: minor,
+	})
+}
+
+// Open implements Linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+	mode := args[2].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, flags, mode)
+}
+
+// Openat implements Linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	flags := args[2].Uint()
+	mode := args[3].ModeT()
+	return openat(t, dirfd, addr, flags, mode)
+}
+
+// Creat implements Linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
+}
+
+func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
+		Flags: flags,
+		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	return uintptr(fd), nil, err
+}
+
+// Rename implements Linux syscall rename(2).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Renameat implements Linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
+}
+
+// Renameat2 implements Linux syscall renameat2(2).
+func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Uint()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error {
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
+		Flags: flags,
+	})
+}
+
+// Rmdir implements Linux syscall rmdir(2).
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlink implements Linux syscall unlink(2).
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlinkat implements Linux syscall unlinkat(2).
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if flags&^linux.AT_REMOVEDIR != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if flags&linux.AT_REMOVEDIR != 0 {
+		return 0, nil, rmdirat(t, dirfd, pathAddr)
+	}
+	return 0, nil, unlinkat(t, dirfd, pathAddr)
+}
+
+// Symlink implements Linux syscall symlink(2).
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	linkpathAddr := args[1].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr)
+}
+
+// Symlinkat implements Linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	newdirfd := args[1].Int()
+	linkpathAddr := args[2].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr)
+}
+
+func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error {
+	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
+	if err != nil {
+		return err
+	}
+	linkpath, err := copyInPath(t, linkpathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
new file mode 100644
index 000000000..317409a18
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getcwd implements Linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+
+	root := t.FSContext().RootDirectoryVFS2()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
+	root.DecRef()
+	wd.DecRef()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Note this is >= because we need a terminator.
+	if uint(len(s)) >= size {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Construct a byte slice containing a NUL terminator.
+	buf := t.CopyScratchBuffer(len(s) + 1)
+	copy(buf, s)
+	buf[len(buf)-1] = 0
+
+	// Write the pathname slice.
+	n, err := t.CopyOutBytes(addr, buf)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Chdir implements Linux syscall chdir(2).
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Fchdir implements Linux syscall fchdir(2).
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Chroot implements Linux syscall chroot(2).
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+		return 0, nil, syserror.EPERM
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetRootDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
new file mode 100644
index 000000000..ddc140b65
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Getdents implements Linux syscall getdents(2).
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, false /* isGetdents64 */)
+}
+
+// Getdents64 implements Linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, true /* isGetdents64 */)
+}
+
+func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := int(args[2].Uint())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	cb := getGetdentsCallback(t, addr, size, isGetdents64)
+	err := file.IterDirents(t, cb)
+	n := size - cb.remaining
+	putGetdentsCallback(cb)
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+type getdentsCallback struct {
+	t            *kernel.Task
+	addr         usermem.Addr
+	remaining    int
+	isGetdents64 bool
+}
+
+var getdentsCallbackPool = sync.Pool{
+	New: func() interface{} {
+		return &getdentsCallback{}
+	},
+}
+
+func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback {
+	cb := getdentsCallbackPool.Get().(*getdentsCallback)
+	*cb = getdentsCallback{
+		t:            t,
+		addr:         addr,
+		remaining:    size,
+		isGetdents64: isGetdents64,
+	}
+	return cb
+}
+
+func putGetdentsCallback(cb *getdentsCallback) {
+	cb.t = nil
+	getdentsCallbackPool.Put(cb)
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
+	var buf []byte
+	if cb.isGetdents64 {
+		// struct linux_dirent64 {
+		//     ino64_t        d_ino;    /* 64-bit inode number */
+		//     off64_t        d_off;    /* 64-bit offset to next structure */
+		//     unsigned short d_reclen; /* Size of this dirent */
+		//     unsigned char  d_type;   /* File type */
+		//     char           d_name[]; /* Filename (null-terminated) */
+		// };
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		if size < cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		buf[18] = dirent.Type
+		copy(buf[19:], dirent.Name)
+		buf[size-1] = 0 // NUL terminator
+	} else {
+		// struct linux_dirent {
+		//     unsigned long  d_ino;     /* Inode number */
+		//     unsigned long  d_off;     /* Offset to next linux_dirent */
+		//     unsigned short d_reclen;  /* Length of this linux_dirent */
+		//     char           d_name[];  /* Filename (null-terminated) */
+		//                       /* length is actually (d_reclen - 2 -
+		//                          offsetof(struct linux_dirent, d_name)) */
+		//     /*
+		//     char           pad;       // Zero padding byte
+		//     char           d_type;    // File type (only since Linux
+		//                               // 2.6.4); offset is (d_reclen - 1)
+		//     */
+		// };
+		if cb.t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
+		}
+		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
+		if size < cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		copy(buf[18:], dirent.Name)
+		buf[size-3] = 0 // NUL terminator
+		buf[size-2] = 0 // zero padding byte
+		buf[size-1] = dirent.Type
+	}
+	n, err := cb.t.CopyOutBytes(cb.addr, buf)
+	if err != nil {
+		// Don't report partially-written dirents by advancing cb.addr or
+		// cb.remaining.
+		return err
+	}
+	cb.addr += usermem.Addr(n)
+	cb.remaining -= n
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
new file mode 100644
index 000000000..5a2418da9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Ioctl implements Linux syscall ioctl(2).
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	ret, err := file.Ioctl(t, t.MemoryManager(), args)
+	return ret, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index e0ac32b33..7d220bc20 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package vfs2
 
 import (
@@ -22,110 +24,142 @@ import (
 // Override syscall table to add syscalls implementations from this package.
 func Override(table map[uintptr]kernel.Syscall) {
 	table[0] = syscalls.Supported("read", Read)
-
-	// Remove syscalls that haven't been converted yet. It's better to get ENOSYS
-	// rather than a SIGSEGV deep in the stack.
-	delete(table, 1)   // write
-	delete(table, 2)   // open
-	delete(table, 3)   // close
-	delete(table, 4)   // stat
-	delete(table, 5)   // fstat
-	delete(table, 6)   // lstat
-	delete(table, 7)   // poll
-	delete(table, 8)   // lseek
-	delete(table, 9)   // mmap
-	delete(table, 16)  // ioctl
-	delete(table, 17)  // pread64
-	delete(table, 18)  // pwrite64
-	delete(table, 19)  // readv
-	delete(table, 20)  // writev
-	delete(table, 21)  // access
-	delete(table, 22)  // pipe
-	delete(table, 32)  // dup
-	delete(table, 33)  // dup2
-	delete(table, 40)  // sendfile
-	delete(table, 59)  // execve
-	delete(table, 72)  // fcntl
-	delete(table, 73)  // flock
-	delete(table, 74)  // fsync
-	delete(table, 75)  // fdatasync
-	delete(table, 76)  // truncate
-	delete(table, 77)  // ftruncate
-	delete(table, 78)  // getdents
-	delete(table, 79)  // getcwd
-	delete(table, 80)  // chdir
-	delete(table, 81)  // fchdir
-	delete(table, 82)  // rename
-	delete(table, 83)  // mkdir
-	delete(table, 84)  // rmdir
-	delete(table, 85)  // creat
-	delete(table, 86)  // link
-	delete(table, 87)  // unlink
-	delete(table, 88)  // symlink
-	delete(table, 89)  // readlink
-	delete(table, 90)  // chmod
-	delete(table, 91)  // fchmod
-	delete(table, 92)  // chown
-	delete(table, 93)  // fchown
-	delete(table, 94)  // lchown
-	delete(table, 133) // mknod
-	delete(table, 137) // statfs
-	delete(table, 138) // fstatfs
-	delete(table, 161) // chroot
-	delete(table, 162) // sync
+	table[1] = syscalls.Supported("write", Write)
+	table[2] = syscalls.Supported("open", Open)
+	table[3] = syscalls.Supported("close", Close)
+	table[4] = syscalls.Supported("stat", Stat)
+	table[5] = syscalls.Supported("fstat", Fstat)
+	table[6] = syscalls.Supported("lstat", Lstat)
+	table[7] = syscalls.Supported("poll", Poll)
+	table[8] = syscalls.Supported("lseek", Lseek)
+	table[9] = syscalls.Supported("mmap", Mmap)
+	table[16] = syscalls.Supported("ioctl", Ioctl)
+	table[17] = syscalls.Supported("pread64", Pread64)
+	table[18] = syscalls.Supported("pwrite64", Pwrite64)
+	table[19] = syscalls.Supported("readv", Readv)
+	table[20] = syscalls.Supported("writev", Writev)
+	table[21] = syscalls.Supported("access", Access)
+	delete(table, 22) // pipe
+	table[23] = syscalls.Supported("select", Select)
+	table[32] = syscalls.Supported("dup", Dup)
+	table[33] = syscalls.Supported("dup2", Dup2)
+	delete(table, 40) // sendfile
+	delete(table, 41) // socket
+	delete(table, 42) // connect
+	delete(table, 43) // accept
+	delete(table, 44) // sendto
+	delete(table, 45) // recvfrom
+	delete(table, 46) // sendmsg
+	delete(table, 47) // recvmsg
+	delete(table, 48) // shutdown
+	delete(table, 49) // bind
+	delete(table, 50) // listen
+	delete(table, 51) // getsockname
+	delete(table, 52) // getpeername
+	delete(table, 53) // socketpair
+	delete(table, 54) // setsockopt
+	delete(table, 55) // getsockopt
+	table[59] = syscalls.Supported("execve", Execve)
+	table[72] = syscalls.Supported("fcntl", Fcntl)
+	delete(table, 73) // flock
+	table[74] = syscalls.Supported("fsync", Fsync)
+	table[75] = syscalls.Supported("fdatasync", Fdatasync)
+	table[76] = syscalls.Supported("truncate", Truncate)
+	table[77] = syscalls.Supported("ftruncate", Ftruncate)
+	table[78] = syscalls.Supported("getdents", Getdents)
+	table[79] = syscalls.Supported("getcwd", Getcwd)
+	table[80] = syscalls.Supported("chdir", Chdir)
+	table[81] = syscalls.Supported("fchdir", Fchdir)
+	table[82] = syscalls.Supported("rename", Rename)
+	table[83] = syscalls.Supported("mkdir", Mkdir)
+	table[84] = syscalls.Supported("rmdir", Rmdir)
+	table[85] = syscalls.Supported("creat", Creat)
+	table[86] = syscalls.Supported("link", Link)
+	table[87] = syscalls.Supported("unlink", Unlink)
+	table[88] = syscalls.Supported("symlink", Symlink)
+	table[89] = syscalls.Supported("readlink", Readlink)
+	table[90] = syscalls.Supported("chmod", Chmod)
+	table[91] = syscalls.Supported("fchmod", Fchmod)
+	table[92] = syscalls.Supported("chown", Chown)
+	table[93] = syscalls.Supported("fchown", Fchown)
+	table[94] = syscalls.Supported("lchown", Lchown)
+	table[132] = syscalls.Supported("utime", Utime)
+	table[133] = syscalls.Supported("mknod", Mknod)
+	table[137] = syscalls.Supported("statfs", Statfs)
+	table[138] = syscalls.Supported("fstatfs", Fstatfs)
+	table[161] = syscalls.Supported("chroot", Chroot)
+	table[162] = syscalls.Supported("sync", Sync)
 	delete(table, 165) // mount
 	delete(table, 166) // umount2
-	delete(table, 172) // iopl
-	delete(table, 173) // ioperm
 	delete(table, 187) // readahead
-	delete(table, 188) // setxattr
-	delete(table, 189) // lsetxattr
-	delete(table, 190) // fsetxattr
-	delete(table, 191) // getxattr
-	delete(table, 192) // lgetxattr
-	delete(table, 193) // fgetxattr
+	table[188] = syscalls.Supported("setxattr", Setxattr)
+	table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
+	table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
+	table[191] = syscalls.Supported("getxattr", Getxattr)
+	table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
+	table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
+	table[194] = syscalls.Supported("listxattr", Listxattr)
+	table[195] = syscalls.Supported("llistxattr", Llistxattr)
+	table[196] = syscalls.Supported("flistxattr", Flistxattr)
+	table[197] = syscalls.Supported("removexattr", Removexattr)
+	table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
+	table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
 	delete(table, 206) // io_setup
 	delete(table, 207) // io_destroy
 	delete(table, 208) // io_getevents
 	delete(table, 209) // io_submit
 	delete(table, 210) // io_cancel
-	delete(table, 213) // epoll_create
-	delete(table, 214) // epoll_ctl_old
-	delete(table, 215) // epoll_wait_old
-	delete(table, 216) // remap_file_pages
-	delete(table, 217) // getdents64
-	delete(table, 232) // epoll_wait
-	delete(table, 233) // epoll_ctl
+	table[213] = syscalls.Supported("epoll_create", EpollCreate)
+	table[217] = syscalls.Supported("getdents64", Getdents64)
+	delete(table, 221) // fdavise64
+	table[232] = syscalls.Supported("epoll_wait", EpollWait)
+	table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
+	table[235] = syscalls.Supported("utimes", Utimes)
 	delete(table, 253) // inotify_init
 	delete(table, 254) // inotify_add_watch
 	delete(table, 255) // inotify_rm_watch
-	delete(table, 257) // openat
-	delete(table, 258) // mkdirat
-	delete(table, 259) // mknodat
-	delete(table, 260) // fchownat
-	delete(table, 261) // futimesat
-	delete(table, 262) // fstatat
-	delete(table, 263) // unlinkat
-	delete(table, 264) // renameat
-	delete(table, 265) // linkat
-	delete(table, 266) // symlinkat
-	delete(table, 267) // readlinkat
-	delete(table, 268) // fchmodat
-	delete(table, 269) // faccessat
-	delete(table, 270) // pselect
-	delete(table, 271) // ppoll
+	table[257] = syscalls.Supported("openat", Openat)
+	table[258] = syscalls.Supported("mkdirat", Mkdirat)
+	table[259] = syscalls.Supported("mknodat", Mknodat)
+	table[260] = syscalls.Supported("fchownat", Fchownat)
+	table[261] = syscalls.Supported("futimens", Futimens)
+	table[262] = syscalls.Supported("newfstatat", Newfstatat)
+	table[263] = syscalls.Supported("unlinkat", Unlinkat)
+	table[264] = syscalls.Supported("renameat", Renameat)
+	table[265] = syscalls.Supported("linkat", Linkat)
+	table[266] = syscalls.Supported("symlinkat", Symlinkat)
+	table[267] = syscalls.Supported("readlinkat", Readlinkat)
+	table[268] = syscalls.Supported("fchmodat", Fchmodat)
+	table[269] = syscalls.Supported("faccessat", Faccessat)
+	table[270] = syscalls.Supported("pselect", Pselect)
+	table[271] = syscalls.Supported("ppoll", Ppoll)
+	delete(table, 275) // splice
+	delete(table, 276) // tee
+	table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
+	table[280] = syscalls.Supported("utimensat", Utimensat)
+	table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
+	delete(table, 282) // signalfd
+	delete(table, 283) // timerfd_create
+	delete(table, 284) // eventfd
 	delete(table, 285) // fallocate
-	delete(table, 291) // epoll_create1
-	delete(table, 292) // dup3
+	delete(table, 286) // timerfd_settime
+	delete(table, 287) // timerfd_gettime
+	delete(table, 288) // accept4
+	delete(table, 289) // signalfd4
+	delete(table, 290) // eventfd2
+	table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
+	table[292] = syscalls.Supported("dup3", Dup3)
 	delete(table, 293) // pipe2
 	delete(table, 294) // inotify_init1
-	delete(table, 295) // preadv
-	delete(table, 296) // pwritev
-	delete(table, 306) // syncfs
-	delete(table, 316) // renameat2
+	table[295] = syscalls.Supported("preadv", Preadv)
+	table[296] = syscalls.Supported("pwritev", Pwritev)
+	delete(table, 299) // recvmmsg
+	table[306] = syscalls.Supported("syncfs", Syncfs)
+	delete(table, 307) // sendmmsg
+	table[316] = syscalls.Supported("renameat2", Renameat2)
 	delete(table, 319) // memfd_create
-	delete(table, 322) // execveat
-	delete(table, 327) // preadv2
-	delete(table, 328) // pwritev2
-	delete(table, 332) // statx
+	table[322] = syscalls.Supported("execveat", Execveat)
+	table[327] = syscalls.Supported("preadv2", Preadv2)
+	table[328] = syscalls.Supported("pwritev2", Pwritev2)
+	table[332] = syscalls.Supported("statx", Statx)
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
index 6af5c400f..a6b367468 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package vfs2
 
 import (
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
new file mode 100644
index 000000000..60a43f0a0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mmap implements Linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	prot := args[2].Int()
+	flags := args[3].Int()
+	fd := args[4].Int()
+	fixed := flags&linux.MAP_FIXED != 0
+	private := flags&linux.MAP_PRIVATE != 0
+	shared := flags&linux.MAP_SHARED != 0
+	anon := flags&linux.MAP_ANONYMOUS != 0
+	map32bit := flags&linux.MAP_32BIT != 0
+
+	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
+	if private == shared {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := memmap.MMapOpts{
+		Length:   args[1].Uint64(),
+		Offset:   args[5].Uint64(),
+		Addr:     args[0].Pointer(),
+		Fixed:    fixed,
+		Unmap:    fixed,
+		Map32Bit: map32bit,
+		Private:  private,
+		Perms: usermem.AccessType{
+			Read:    linux.PROT_READ&prot != 0,
+			Write:   linux.PROT_WRITE&prot != 0,
+			Execute: linux.PROT_EXEC&prot != 0,
+		},
+		MaxPerms:  usermem.AnyAccess,
+		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+		Precommit: linux.MAP_POPULATE&flags != 0,
+	}
+	if linux.MAP_LOCKED&flags != 0 {
+		opts.MLockMode = memmap.MLockEager
+	}
+	defer func() {
+		if opts.MappingIdentity != nil {
+			opts.MappingIdentity.DecRef()
+		}
+	}()
+
+	if !anon {
+		// Convert the passed FD to a file reference.
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// mmap unconditionally requires that the FD is readable.
+		if !file.IsReadable() {
+			return 0, nil, syserror.EACCES
+		}
+		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
+		if shared && !file.IsWritable() {
+			opts.MaxPerms.Write = false
+		}
+
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	rv, err := t.MemoryManager().MMap(t, opts)
+	return uintptr(rv), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
new file mode 100644
index 000000000..97da6c647
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) {
+	pathname, err := t.CopyInString(addr, linux.PATH_MAX)
+	if err != nil {
+		return fspath.Path{}, err
+	}
+	return fspath.Parse(pathname), nil
+}
+
+type taskPathOperation struct {
+	pop          vfs.PathOperation
+	haveStartRef bool
+}
+
+func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) {
+	root := t.FSContext().RootDirectoryVFS2()
+	start := root
+	haveStartRef := false
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			root.DecRef()
+			return taskPathOperation{}, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			haveStartRef = true
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				root.DecRef()
+				return taskPathOperation{}, syserror.EBADF
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			haveStartRef = true
+			dirfile.DecRef()
+		}
+	}
+	return taskPathOperation{
+		pop: vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+		},
+		haveStartRef: haveStartRef,
+	}, nil
+}
+
+func (tpop *taskPathOperation) Release() {
+	tpop.pop.Root.DecRef()
+	if tpop.haveStartRef {
+		tpop.pop.Start.DecRef()
+		tpop.haveStartRef = false
+	}
+}
+
+type shouldAllowEmptyPath bool
+
+const (
+	disallowEmptyPath shouldAllowEmptyPath = false
+	allowEmptyPath    shouldAllowEmptyPath = true
+)
+
+type shouldFollowFinalSymlink bool
+
+const (
+	nofollowFinalSymlink shouldFollowFinalSymlink = false
+	followFinalSymlink   shouldFollowFinalSymlink = true
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
new file mode 100644
index 000000000..dbf4882da
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -0,0 +1,584 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select. This has no
+// equivalent in Linux; it exists in gVisor since allocation failure in Go is
+// unrecoverable.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2).
+const (
+	// selectReadEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLIN_SET.
+	selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+	// selectWriteEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLOUT_SET.
+	selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+	// selectExceptEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLEX_SET.
+	selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the associated file description and waiter of a PollFD.
+type pollState struct {
+	file   *vfs.FileDescription
+	waiter waiter.Entry
+}
+
+// initReadiness gets the current ready mask for the file represented by the FD
+// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is
+// used to register with the file for event notifications, and a reference to
+// the file is stored in "state".
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+	if pfd.FD < 0 {
+		pfd.REvents = 0
+		return
+	}
+
+	file := t.GetFileVFS2(pfd.FD)
+	if file == nil {
+		pfd.REvents = linux.POLLNVAL
+		return
+	}
+
+	if ch == nil {
+		defer file.DecRef()
+	} else {
+		state.file = file
+		state.waiter, _ = waiter.NewChannelEntry(ch)
+		file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	}
+
+	r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	pfd.REvents = int16(r.ToLinux()) & pfd.Events
+}
+
+// releaseState releases all the pollState in "state".
+func releaseState(state []pollState) {
+	for i := range state {
+		if state[i].file != nil {
+			state[i].file.EventUnregister(&state[i].waiter)
+			state[i].file.DecRef()
+		}
+	}
+}
+
+// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
+// when "timeout" is greater than zero.
+//
+// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or
+// positive if interrupted by a signal.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+	var ch chan struct{}
+	if timeout != 0 {
+		ch = make(chan struct{}, 1)
+	}
+
+	// Register for event notification in the files involved if we may
+	// block (timeout not zero). Once we find a file that has a non-zero
+	// result, we stop registering for events but still go through all files
+	// to get their ready masks.
+	state := make([]pollState, len(pfd))
+	defer releaseState(state)
+	n := uintptr(0)
+	for i := range pfd {
+		initReadiness(t, &pfd[i], &state[i], ch)
+		if pfd[i].REvents != 0 {
+			n++
+			ch = nil
+		}
+	}
+
+	if timeout == 0 {
+		return timeout, n, nil
+	}
+
+	haveTimeout := timeout >= 0
+
+	for n == 0 {
+		var err error
+		// Wait for a notification.
+		timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout)
+		if err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = nil
+			}
+			return timeout, 0, err
+		}
+
+		// We got notified, count how many files are ready. If none,
+		// then this was a spurious notification, and we just go back
+		// to sleep with the remaining timeout.
+		for i := range state {
+			if state[i].file == nil {
+				continue
+			}
+
+			r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
+			rl := int16(r.ToLinux()) & pfd[i].Events
+			if rl != 0 {
+				pfd[i].REvents = rl
+				n++
+			}
+		}
+	}
+
+	return timeout, n, nil
+}
+
+// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max.
+func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+	if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+		return nil, syserror.EINVAL
+	}
+
+	pfd := make([]linux.PollFD, nfds)
+	if nfds > 0 {
+		if _, err := t.CopyIn(addr, &pfd); err != nil {
+			return nil, err
+		}
+	}
+
+	return pfd, nil
+}
+
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+	pfd, err := copyInPollFDs(t, addr, nfds)
+	if err != nil {
+		return timeout, 0, err
+	}
+
+	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
+	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+	// polling, changing event masks here is an application-visible difference.
+	// (Linux also doesn't copy out event masks at all, only revents.)
+	for i := range pfd {
+		pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+	}
+	remainingTimeout, n, err := pollBlock(t, pfd, timeout)
+	err = syserror.ConvertIntr(err, syserror.EINTR)
+
+	// The poll entries are copied out regardless of whether
+	// any are set or not. This aligns with the Linux behavior.
+	if nfds > 0 && err == nil {
+		if _, err := t.CopyOut(addr, pfd); err != nil {
+			return remainingTimeout, 0, err
+		}
+	}
+
+	return remainingTimeout, n, err
+}
+
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+	set := make([]byte, nBytes)
+
+	if addr != 0 {
+		if _, err := t.CopyIn(addr, &set); err != nil {
+			return nil, err
+		}
+		// If we only use part of the last byte, mask out the extraneous bits.
+		//
+		// N.B. This only works on little-endian architectures.
+		if nBitsInLastPartialByte != 0 {
+			set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+		}
+	}
+	return set, nil
+}
+
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+	if nfds < 0 || nfds > fileCap {
+		return 0, syserror.EINVAL
+	}
+
+	// Calculate the size of the fd sets (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
+
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+
+	// Count how many FDs are actually being requested so that we can build
+	// a PollFD array.
+	fdCount := 0
+	for i := 0; i < nBytes; i++ {
+		v := r[i] | w[i] | e[i]
+		for v != 0 {
+			v &= (v - 1)
+			fdCount++
+		}
+	}
+
+	// Build the PollFD array.
+	pfd := make([]linux.PollFD, 0, fdCount)
+	var fd int32
+	for i := 0; i < nBytes; i++ {
+		rV, wV, eV := r[i], w[i], e[i]
+		v := rV | wV | eV
+		m := byte(1)
+		for j := 0; j < 8; j++ {
+			if (v & m) != 0 {
+				// Make sure the fd is valid and decrement the reference
+				// immediately to ensure we don't leak. Note, another thread
+				// might be about to close fd. This is racy, but that's
+				// OK. Linux is racy in the same way.
+				file := t.GetFileVFS2(fd)
+				if file == nil {
+					return 0, syserror.EBADF
+				}
+				file.DecRef()
+
+				var mask int16
+				if (rV & m) != 0 {
+					mask |= selectReadEvents
+				}
+
+				if (wV & m) != 0 {
+					mask |= selectWriteEvents
+				}
+
+				if (eV & m) != 0 {
+					mask |= selectExceptEvents
+				}
+
+				pfd = append(pfd, linux.PollFD{
+					FD:     fd,
+					Events: mask,
+				})
+			}
+
+			fd++
+			m <<= 1
+		}
+	}
+
+	// Do the syscall, then count the number of bits set.
+	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
+		return 0, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	// r, w, and e are currently event mask bitsets; unset bits corresponding
+	// to events that *didn't* occur.
+	bitSetCount := uintptr(0)
+	for idx := range pfd {
+		events := pfd[idx].REvents
+		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+		m := byte(1) << j
+		if r[i]&m != 0 {
+			if (events & selectReadEvents) != 0 {
+				bitSetCount++
+			} else {
+				r[i] &^= m
+			}
+		}
+		if w[i]&m != 0 {
+			if (events & selectWriteEvents) != 0 {
+				bitSetCount++
+			} else {
+				w[i] &^= m
+			}
+		}
+		if e[i]&m != 0 {
+			if (events & selectExceptEvents) != 0 {
+				bitSetCount++
+			} else {
+				e[i] &^= m
+			}
+		}
+	}
+
+	// Copy updated vectors back.
+	if readFDs != 0 {
+		if _, err := t.CopyOut(readFDs, r); err != nil {
+			return 0, err
+		}
+	}
+
+	if writeFDs != 0 {
+		if _, err := t.CopyOut(writeFDs, w); err != nil {
+			return 0, err
+		}
+	}
+
+	if exceptFDs != 0 {
+		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+			return 0, err
+		}
+	}
+
+	return bitSetCount, nil
+}
+
+// timeoutRemaining returns the amount of time remaining for the specified
+// timeout or 0 if it has elapsed.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+	now := t.Kernel().MonotonicClock().Now()
+	remaining := timeout - now.Sub(startNs)
+	if remaining < 0 {
+		remaining = 0
+	}
+	return remaining
+}
+
+// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
+	return tsRemaining.CopyOut(t, timespecAddr)
+}
+
+// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
+	return tvRemaining.CopyOut(t, timevalAddr)
+}
+
+// pollRestartBlock encapsulates the state required to restart poll(2) via
+// restart_syscall(2).
+//
+// +stateify savable
+type pollRestartBlock struct {
+	pfdAddr usermem.Addr
+	nfds    uint
+	timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return poll(t, p.pfdAddr, p.nfds, p.timeout)
+}
+
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+	remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	// On an interrupt poll(2) is restarted with the remaining timeout.
+	if err == syserror.EINTR {
+		t.SetSyscallRestartBlock(&pollRestartBlock{
+			pfdAddr: pfdAddr,
+			nfds:    nfds,
+			timeout: remainingTimeout,
+		})
+		return 0, kernel.ERESTART_RESTARTBLOCK
+	}
+	return n, err
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timeout := time.Duration(args[2].Int()) * time.Millisecond
+	n, err := poll(t, pfdAddr, nfds, timeout)
+	return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timespecAddr := args[2].Pointer()
+	maskAddr := args[3].Pointer()
+	maskSize := uint(args[4].Uint())
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	_, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
+	// if interrupted by something other than a signal handled by the
+	// application (i.e. returns ERESTARTNOHAND). However, if
+	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
+	// wrong timeout, so the error should be left as EINTR.
+	//
+	// Note that this means that if err is nil but copyErr is not, copyErr is
+	// ignored. This is consistent with Linux.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timevalAddr := args[4].Pointer()
+
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timevalAddr != 0 {
+		var timeval linux.Timeval
+		if err := timeval.CopyIn(t, timevalAddr); err != nil {
+			return 0, nil, err
+		}
+		if timeval.Sec < 0 || timeval.Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(timeval.ToNsecCapped())
+	}
+	startNs := t.Kernel().MonotonicClock().Now()
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timespecAddr := args[4].Pointer()
+	maskWithSizeAddr := args[5].Pointer()
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskWithSizeAddr != 0 {
+		if t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
+		}
+		var maskStruct sigSetWithSize
+		if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+			return 0, nil, err
+		}
+		if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// +marshal
+type sigSetWithSize struct {
+	sigsetAddr   uint64
+	sizeofSigset uint64
+}
+
+// copyTimespecInToDuration copies a Timespec from the untrusted app range,
+// validates it and converts it to a Duration.
+//
+// If the Timespec is larger than what can be represented in a Duration, the
+// returned value is the maximum that Duration will allow.
+//
+// If timespecAddr is NULL, the returned value is negative.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timespecAddr != 0 {
+		var timespec linux.Timespec
+		if err := timespec.CopyIn(t, timespecAddr); err != nil {
+			return 0, err
+		}
+		if !timespec.Valid() {
+			return 0, syserror.EINVAL
+		}
+		timeout = time.Duration(timespec.ToNsecCapped())
+	}
+	return timeout, nil
+}
+
+func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error {
+	if maskAddr == 0 {
+		return nil
+	}
+	if maskSize != linux.SignalSetSize {
+		return syserror.EINVAL
+	}
+	var mask linux.SignalSet
+	if err := mask.CopyIn(t, maskAddr); err != nil {
+		return err
+	}
+	mask &^= kernel.UnblockableSignals
+	oldmask := t.SignalMask()
+	t.SetSignalMask(mask)
+	t.SetSavedSignalMask(oldmask)
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
new file mode 100644
index 000000000..35f6308d6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -0,0 +1,511 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	eventMaskRead  = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+	eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements Linux syscall read(2).
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Readv implements Linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.Read(t, dst, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.Read(t, dst, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Pread64 implements Linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Preadv implements Linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// Preadv2 implements Linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.ReadOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = read(t, file, dst, opts)
+	} else {
+		n, err = pread(t, file, dst, offset, opts)
+	}
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+}
+
+func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.PRead(t, dst, offset, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.PRead(t, dst, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Write implements Linux syscall write(2).
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+}
+
+// Writev implements Linux syscall writev(2).
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+}
+
+func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.Write(t, src, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.Write(t, src, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Pwrite64 implements Linux syscall pwrite64(2).
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+}
+
+// Pwritev implements Linux syscall pwritev(2).
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+}
+
+// Pwritev2 implements Linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.WriteOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = write(t, file, src, opts)
+	} else {
+		n, err = pwrite(t, file, src, offset, opts)
+	}
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+}
+
+func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.PWrite(t, src, offset, opts)
+	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err := file.PWrite(t, src, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+		if err := t.Block(ch); err != nil {
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	return total, err
+}
+
+// Lseek implements Linux syscall lseek(2).
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	whence := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newoff, err := file.Seek(t, offset, whence)
+	return uintptr(newoff), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
new file mode 100644
index 000000000..9250659ff
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -0,0 +1,380 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+// Chmod implements Linux syscall chmod(2).
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode)
+}
+
+// Fchmodat implements Linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, fchmodat(t, dirfd, pathAddr, mode)
+}
+
+func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Fchmod implements Linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].ModeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Chown implements Linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */)
+}
+
+// Lchown implements Linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	owner := args[2].Int()
+	group := args[3].Int()
+	flags := args[4].Int()
+	return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags)
+}
+
+func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
+}
+
+func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
+	userns := t.UserNamespace()
+	if owner != -1 {
+		kuid := userns.MapToKUID(auth.UID(owner))
+		if !kuid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_UID
+		opts.Stat.UID = uint32(kuid)
+	}
+	if group != -1 {
+		kgid := userns.MapToKGID(auth.GID(group))
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_GID
+		opts.Stat.GID = uint32(kgid)
+	}
+	return nil
+}
+
+// Fchown implements Linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	owner := args[1].Int()
+	group := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, file.SetStat(t, opts)
+}
+
+// Truncate implements Linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+}
+
+// Ftruncate implements Linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+}
+
+// Utime implements Linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+		},
+	}
+	if timesAddr == 0 {
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	} else {
+		var times linux.Utime
+		if err := times.CopyIn(t, timesAddr); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime.Sec = times.Actime
+		opts.Stat.Mtime.Sec = times.Modtime
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimes implements Linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+		},
+	}
+	if timesAddr == 0 {
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	} else {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime = linux.StatxTimestamp{
+			Sec:  times[0].Sec,
+			Nsec: uint32(times[0].Usec * 1000),
+		}
+		opts.Stat.Mtime = linux.StatxTimestamp{
+			Sec:  times[1].Sec,
+			Nsec: uint32(times[1].Usec * 1000),
+		}
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimensat implements Linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Futimens implements Linux syscall futimens(2).
+func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	timesAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.SetStat(t, opts)
+}
+
+func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+	if timesAddr == 0 {
+		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+		return nil
+	}
+	var times [2]linux.Timespec
+	if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		return err
+	}
+	if times[0].Nsec != linux.UTIME_OMIT {
+		opts.Stat.Mask |= linux.STATX_ATIME
+		opts.Stat.Atime = linux.StatxTimestamp{
+			Sec:  times[0].Sec,
+			Nsec: uint32(times[0].Nsec),
+		}
+	}
+	if times[1].Nsec != linux.UTIME_OMIT {
+		opts.Stat.Mask |= linux.STATX_MTIME
+		opts.Stat.Mtime = linux.StatxTimestamp{
+			Sec:  times[1].Sec,
+			Nsec: uint32(times[1].Nsec),
+		}
+	}
+	return nil
+}
+
+func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.SetStat() instead of
+				// VirtualFilesystem.SetStatAt(), since the former may be able
+				// to use opened file state to expedite the SetStat.
+				err := dirfile.SetStat(t, *opts)
+				dirfile.DecRef()
+				return err
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+	}, opts)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
new file mode 100644
index 000000000..dca8d7011
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -0,0 +1,346 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Stat implements Linux syscall stat(2).
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */)
+}
+
+// Lstat implements Linux syscall lstat(2).
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
+func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+	return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags)
+}
+
+func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for fstatat(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return err
+				}
+				var stat linux.Stat
+				convertStatxToUserStat(t, &statx, &stat)
+				return stat.CopyOut(t, statAddr)
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	return stat.CopyOut(t, statAddr)
+}
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint64(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int64(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
+
+func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
+	return linux.Timespec{
+		Sec:  sxts.Sec,
+		Nsec: int64(sxts.Nsec),
+	}
+}
+
+// Fstat implements Linux syscall fstat(2).
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	statAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	statx, err := file.Stat(t, vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	return 0, nil, stat.CopyOut(t, statAddr)
+}
+
+// Statx implements Linux syscall statx(2).
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+	mask := args[3].Uint()
+	statxAddr := args[4].Pointer()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: mask,
+		Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE),
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return 0, nil, syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for statx(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return 0, nil, err
+				}
+				userifyStatx(t, &statx)
+				return 0, nil, statx.CopyOut(t, statxAddr)
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	userifyStatx(t, &statx)
+	return 0, nil, statx.CopyOut(t, statxAddr)
+}
+
+func userifyStatx(t *kernel.Task, statx *linux.Statx) {
+	userns := t.UserNamespace()
+	statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow())
+	statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow())
+}
+
+// Readlink implements Linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+	return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
+}
+
+// Access implements Linux syscall access(2).
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// FIXME(jamieliu): actually implement
+	return 0, nil, nil
+}
+
+// Faccessat implements Linux syscall access(2).
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// FIXME(jamieliu): actually implement
+	return 0, nil, nil
+}
+
+// Readlinkat implements Linux syscall mknodat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	return readlinkat(t, dirfd, pathAddr, bufAddr, size)
+}
+
+func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
+	if int(size) <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
+	// call operates on the symbolic link referred to by dirfd ..." -
+	// readlinkat(2)
+	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if len(target) > int(size) {
+		target = target[:size]
+	}
+	n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Statfs implements Linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, statfs.CopyOut(t, bufAddr)
+}
+
+// Fstatfs implements Linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, statfs.CopyOut(t, bufAddr)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
new file mode 100644
index 000000000..365250b0b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -0,0 +1,87 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements Linux syscall sync(2).
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t)
+}
+
+// Syncfs implements Linux syscall syncfs(2).
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SyncFS(t)
+}
+
+// Fsync implements Linux syscall fsync(2).
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.Sync(t)
+}
+
+// Fdatasync implements Linux syscall fdatasync(2).
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata.
+	return Fsync(t, args)
+}
+
+// SyncFileRange implements Linux syscall sync_file_range(2).
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	nbytes := args[2].Int64()
+	flags := args[3].Uint()
+
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if nbytes < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
+	// [offset, offset+nbytes).
+	return 0, nil, file.Sync(t)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
deleted file mode 100644
index 7667524c7..000000000
--- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs2
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-const (
-	// EventMaskRead contains events that can be triggered on reads.
-	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
-)
-
-// Read implements linux syscall read(2).  Note that we try to get a buffer that
-// is exactly the size requested because some applications like qemu expect
-// they can do large reads all at once.  Bug for bug.  Same for other read
-// calls below.
-func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	fd := args[0].Int()
-	addr := args[1].Pointer()
-	size := args[2].SizeT()
-
-	file := t.GetFileVFS2(fd)
-	if file == nil {
-		return 0, nil, syserror.EBADF
-	}
-	defer file.DecRef()
-
-	// Check that the size is legitimate.
-	si := int(size)
-	if si < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
-	// Get the destination of the read.
-	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-	if err != nil {
-		return 0, nil, err
-	}
-
-	n, err := read(t, file, dst, vfs.ReadOptions{})
-	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
-}
-
-func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	n, err := file.Read(t, dst, opts)
-	if err != syserror.ErrWouldBlock {
-		return n, err
-	}
-
-	// Register for notifications.
-	w, ch := waiter.NewChannelEntry(nil)
-	file.EventRegister(&w, EventMaskRead)
-
-	total := n
-	for {
-		// Shorten dst to reflect bytes previously read.
-		dst = dst.DropFirst(int(n))
-
-		// Issue the request and break out if it completes with anything other than
-		// "would block".
-		n, err := file.Read(t, dst, opts)
-		total += n
-		if err != syserror.ErrWouldBlock {
-			break
-		}
-		if err := t.Block(ch); err != nil {
-			break
-		}
-	}
-	file.EventUnregister(&w)
-
-	return total, err
-}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
new file mode 100644
index 000000000..89e9ff4d7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -0,0 +1,353 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"bytes"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Listxattr implements Linux syscall listxattr(2).
+func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, followFinalSymlink)
+}
+
+// Llistxattr implements Linux syscall llistxattr(2).
+func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, nofollowFinalSymlink)
+}
+
+func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Flistxattr implements Linux syscall flistxattr(2).
+func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	names, err := file.Listxattr(t)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Getxattr implements Linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, followFinalSymlink)
+}
+
+// Lgetxattr implements Linux syscall lgetxattr(2).
+func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, nofollowFinalSymlink)
+}
+
+func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Fgetxattr implements Linux syscall fgetxattr(2).
+func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := file.Getxattr(t, name)
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Setxattr implements Linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, followFinalSymlink)
+}
+
+// Lsetxattr implements Linux syscall lsetxattr(2).
+func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, nofollowFinalSymlink)
+}
+
+func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Fsetxattr implements Linux syscall fsetxattr(2).
+func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Removexattr implements Linux syscall removexattr(2).
+func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, followFinalSymlink)
+}
+
+// Lremovexattr implements Linux syscall lremovexattr(2).
+func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, nofollowFinalSymlink)
+}
+
+func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+}
+
+// Fremovexattr implements Linux syscall fremovexattr(2).
+func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Removexattr(t, name)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	if err != nil {
+		if err == syserror.ENAMETOOLONG {
+			return "", syserror.ERANGE
+		}
+		return "", err
+	}
+	if len(name) == 0 {
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) {
+	if size > linux.XATTR_LIST_MAX {
+		size = linux.XATTR_LIST_MAX
+	}
+	var buf bytes.Buffer
+	for _, name := range names {
+		buf.WriteString(name)
+		buf.WriteByte(0)
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the list.
+		return buf.Len(), nil
+	}
+	if buf.Len() > int(size) {
+		if size >= linux.XATTR_LIST_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(listAddr, buf.Bytes())
+}
+
+func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		return "", syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
+		return "", err
+	}
+	return gohacks.StringFromImmutableBytes(buf), nil
+}
+
+func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		size = linux.XATTR_SIZE_MAX
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the value.
+		return len(value), nil
+	}
+	if len(value) > int(size) {
+		if size >= linux.XATTR_SIZE_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value))
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 0b4f18ab5..07c8383e6 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -43,6 +43,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/gohacks",
         "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index eed41139b..3da45d744 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -202,6 +202,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
 	// Add epi to file.epolls so that it is removed when the last
 	// FileDescription reference is dropped.
 	file.epollMu.Lock()
+	if file.epolls == nil {
+		file.epolls = make(map[*epollInterest]struct{})
+	}
 	file.epolls[epi] = struct{}{}
 	file.epollMu.Unlock()
 
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 1fe766a44..bc7581698 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,6 +26,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -160,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer {
 // Lookup may be called even if there are concurrent mutators of mt.
 func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
 	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
-	hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+	hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
 
 loop:
 	for {
@@ -361,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
 
 //go:linkname rand32 runtime.fastrand
 func rand32() uint32
-
-// This is copy/pasted from runtime.noescape(), and is needed because arguments
-// apparently escape from all functions defined by linkname.
-//
-//go:nosplit
-func noescape(p unsafe.Pointer) unsafe.Pointer {
-	x := uintptr(p)
-	return unsafe.Pointer(x ^ 0)
-}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8a0b382f6..eb4ebb511 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -228,7 +228,7 @@ func (rp *ResolvingPath) Advance() {
 		rp.pit = next
 	} else { // at end of path segment, continue with next one
 		rp.curPart--
-		rp.pit = rp.parts[rp.curPart-1]
+		rp.pit = rp.parts[rp.curPart]
 	}
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8f29031b2..73f8043be 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -385,15 +385,11 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 				// Only a regular file can be executed.
 				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
 				if err != nil {
+					fd.DecRef()
 					return nil, err
 				}
-				if stat.Mask&linux.STATX_TYPE != 0 {
-					// This shouldn't happen, but if type can't be retrieved, file can't
-					// be executed.
-					return nil, syserror.EACCES
-				}
-				if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular {
-					ctx.Infof("%q is not a regular file: %v", pop.Path, t)
+				if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
+					fd.DecRef()
 					return nil, syserror.EACCES
 				}
 			}
diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD
index ff8b9e91a..6c9ada9c7 100644
--- a/pkg/usermem/BUILD
+++ b/pkg/usermem/BUILD
@@ -25,7 +25,6 @@ go_library(
         "bytes_io_unsafe.go",
         "usermem.go",
         "usermem_arm64.go",
-        "usermem_unsafe.go",
         "usermem_x86.go",
     ],
     visibility = ["//:sandbox"],
@@ -33,6 +32,7 @@ go_library(
         "//pkg/atomicbitops",
         "//pkg/binary",
         "//pkg/context",
+        "//pkg/gohacks",
         "//pkg/log",
         "//pkg/safemem",
         "//pkg/syserror",
diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go
index 71fd4e155..d2f4403b0 100644
--- a/pkg/usermem/usermem.go
+++ b/pkg/usermem/usermem.go
@@ -23,6 +23,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -251,7 +252,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 		}
 		end, ok := addr.AddLength(uint64(readlen))
 		if !ok {
-			return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
+			return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT
 		}
 		// Shorten the read to avoid crossing page boundaries, since faulting
 		// in a page unnecessarily is expensive. This also ensures that partial
@@ -272,16 +273,16 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 		// Look for the terminating zero byte, which may have occurred before
 		// hitting err.
 		if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
-			return stringFromImmutableBytes(buf[:done+i]), nil
+			return gohacks.StringFromImmutableBytes(buf[:done+i]), nil
 		}
 
 		done += n
 		if err != nil {
-			return stringFromImmutableBytes(buf[:done]), err
+			return gohacks.StringFromImmutableBytes(buf[:done]), err
 		}
 		addr = end
 	}
-	return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
+	return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG
 }
 
 // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The
diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go
deleted file mode 100644
index 876783e78..000000000
--- a/pkg/usermem/usermem_unsafe.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"unsafe"
-)
-
-// stringFromImmutableBytes is equivalent to string(bs), except that it never
-// copies even if escape analysis can't prove that bs does not escape. This is
-// only valid if bs is never mutated after stringFromImmutableBytes returns.
-func stringFromImmutableBytes(bs []byte) string {
-	// Compare strings.Builder.String().
-	return *(*string)(unsafe.Pointer(&bs))
-}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index c69f4c602..a4627905e 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -229,7 +229,9 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_NANOSLEEP: {},
 	syscall.SYS_PPOLL:     {},
 	syscall.SYS_PREAD64:   {},
+	syscall.SYS_PREADV:    {},
 	syscall.SYS_PWRITE64:  {},
+	syscall.SYS_PWRITEV:   {},
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-- 
cgit v1.2.3


From 72e3f3a3eef3a1dc02db0ff71f98a5d7fe89a6e3 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 25 Feb 2020 13:42:34 -0800
Subject: Add option to skip stuck tasks waiting for address space

PiperOrigin-RevId: 297192390
---
 pkg/sentry/kernel/kernel.go       |  4 ++++
 pkg/sentry/kernel/task_context.go |  2 +-
 pkg/sentry/kernel/task_exec.go    |  2 +-
 pkg/sentry/kernel/task_usermem.go |  2 +-
 pkg/sentry/mm/address_space.go    | 23 ++++++++++++++---------
 pkg/sentry/mm/lifecycle.go        | 26 ++++++++++++++------------
 pkg/sentry/mm/mm.go               |  5 +++++
 pkg/sentry/mm/mm_test.go          |  2 +-
 8 files changed, 41 insertions(+), 25 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index c62fd6eb1..8b76750e9 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -247,6 +247,10 @@ type Kernel struct {
 
 	// VFS keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
+
+	// If set to true, report address space activation waits as if the task is in
+	// external wait so that the watchdog doesn't report the task stuck.
+	SleepForAddressSpaceActivation bool
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2be982684..0158b1788 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -140,7 +140,7 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*Task
 	}
 
 	// Prepare a new user address space to load into.
-	m := mm.NewMemoryManager(k, k)
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
 	defer m.DecUsers(ctx)
 	args.MemoryManager = m
 
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 8f57a34a6..00c425cca 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -220,7 +220,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.mu.Unlock()
 	t.unstopVforkParent()
 	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
-	t.MemoryManager().Activate()
+	t.MemoryManager().Activate(t)
 
 	t.ptraceExec(oldTID)
 	return (*runSyscallExit)(nil)
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 2bf3ce8a8..b02044ad2 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -30,7 +30,7 @@ var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
 // Activate ensures that the task has an active address space.
 func (t *Task) Activate() {
 	if mm := t.MemoryManager(); mm != nil {
-		if err := mm.Activate(); err != nil {
+		if err := mm.Activate(t); err != nil {
 			panic("unable to activate mm: " + err.Error())
 		}
 	}
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 94d39af60..0332fc71c 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -38,7 +39,7 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
 //
 // When this MemoryManager is no longer needed by a task, it should call
 // Deactivate to release the reference.
-func (mm *MemoryManager) Activate() error {
+func (mm *MemoryManager) Activate(ctx context.Context) error {
 	// Fast path: the MemoryManager already has an active
 	// platform.AddressSpace, and we just need to indicate that we need it too.
 	for {
@@ -91,16 +92,20 @@ func (mm *MemoryManager) Activate() error {
 		if as == nil {
 			// AddressSpace is unavailable, we must wait.
 			//
-			// activeMu must not be held while waiting, as the user
-			// of the address space we are waiting on may attempt
-			// to take activeMu.
-			//
-			// Don't call UninterruptibleSleepStart to register the
-			// wait to allow the watchdog stuck task to trigger in
-			// case a process is starved waiting for the address
-			// space.
+			// activeMu must not be held while waiting, as the user of the address
+			// space we are waiting on may attempt to take activeMu.
 			mm.activeMu.Unlock()
+
+			sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation
+			if sleep {
+				// Mark this task sleeping while waiting for the address space to
+				// prevent the watchdog from reporting it as a stuck task.
+				ctx.UninterruptibleSleepStart(false)
+			}
 			<-c
+			if sleep {
+				ctx.UninterruptibleSleepFinish(false)
+			}
 			continue
 		}
 
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 3c263ebaa..d8a5b9d29 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -28,16 +28,17 @@ import (
 )
 
 // NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
-func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager {
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
 	return &MemoryManager{
-		p:           p,
-		mfp:         mfp,
-		haveASIO:    p.SupportsAddressSpaceIO(),
-		privateRefs: &privateRefs{},
-		users:       1,
-		auxv:        arch.Auxv{},
-		dumpability: UserDumpable,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		p:                  p,
+		mfp:                mfp,
+		haveASIO:           p.SupportsAddressSpaceIO(),
+		privateRefs:        &privateRefs{},
+		users:              1,
+		auxv:               arch.Auxv{},
+		dumpability:        UserDumpable,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: sleepForActivation,
 	}
 }
 
@@ -79,9 +80,10 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 		envv:                 mm.envv,
 		auxv:                 append(arch.Auxv(nil), mm.auxv...),
 		// IncRef'd below, once we know that there isn't an error.
-		executable:  mm.executable,
-		dumpability: mm.dumpability,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		executable:         mm.executable,
+		dumpability:        mm.dumpability,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: mm.sleepForActivation,
 	}
 
 	// Copy vmas.
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 637383c7a..c2195ae11 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -226,6 +226,11 @@ type MemoryManager struct {
 	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
 	// must be cloned when CLONE_VM is used.
 	aioManager aioManager
+
+	// sleepForActivation indicates whether the task should report to be sleeping
+	// before trying to activate the address space. When set to true, delays in
+	// activation are not reported as stuck tasks by the watchdog.
+	sleepForActivation bool
 }
 
 // vma represents a virtual memory area.
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index edacca741..fdc308542 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -31,7 +31,7 @@ import (
 func testMemoryManager(ctx context.Context) *MemoryManager {
 	p := platform.FromContext(ctx)
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
-	mm := NewMemoryManager(p, mfp)
+	mm := NewMemoryManager(p, mfp, false)
 	mm.layout = arch.MmapLayout{
 		MinAddr:      p.MinUserAddress(),
 		MaxAddr:      p.MaxUserAddress(),
-- 
cgit v1.2.3


From 5f1f9dd9d23d2b805c77b5c38d5900d13e6a29fe Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 25 Feb 2020 15:15:28 -0800
Subject: Use link-local source address for link-local multicast

Tests:
- header_test.TestIsV6LinkLocalMulticastAddress
- header_test.TestScopeForIPv6Address
- stack_test.TestIPv6SourceAddressSelectionScopeAndSameAddress
PiperOrigin-RevId: 297215576
---
 pkg/tcpip/header/ipv6.go      | 22 ++++++++++
 pkg/tcpip/header/ipv6_test.go | 98 ++++++++++++++++++++++++++++++++++++++++---
 pkg/tcpip/stack/stack_test.go | 27 ++++++++----
 3 files changed, 134 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 70e6ce095..76e88e9b3 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -115,6 +115,19 @@ const (
 	// for the secret key used to generate an opaque interface identifier as
 	// outlined by RFC 7217.
 	OpaqueIIDSecretKeyMinBytes = 16
+
+	// ipv6MulticastAddressScopeByteIdx is the byte where the scope (scop) field
+	// is located within a multicast IPv6 address, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeByteIdx = 1
+
+	// ipv6MulticastAddressScopeMask is the mask for the scope (scop) field,
+	// within the byte holding the field, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeMask = 0xF
+
+	// ipv6LinkLocalMulticastScope is the value of the scope (scop) field within
+	// a multicast IPv6 address that indicates the address has link-local scope,
+	// as per RFC 4291 section 2.7.
+	ipv6LinkLocalMulticastScope = 2
 )
 
 // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
@@ -340,6 +353,12 @@ func IsV6LinkLocalAddress(addr tcpip.Address) bool {
 	return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80
 }
 
+// IsV6LinkLocalMulticastAddress determines if the provided address is an IPv6
+// link-local multicast address.
+func IsV6LinkLocalMulticastAddress(addr tcpip.Address) bool {
+	return IsV6MulticastAddress(addr) && addr[ipv6MulticastAddressScopeByteIdx]&ipv6MulticastAddressScopeMask == ipv6LinkLocalMulticastScope
+}
+
 // IsV6UniqueLocalAddress determines if the provided address is an IPv6
 // unique-local address (within the prefix FC00::/7).
 func IsV6UniqueLocalAddress(addr tcpip.Address) bool {
@@ -411,6 +430,9 @@ func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, *tcpip.Error) {
 	}
 
 	switch {
+	case IsV6LinkLocalMulticastAddress(addr):
+		return LinkLocalScope, nil
+
 	case IsV6LinkLocalAddress(addr):
 		return LinkLocalScope, nil
 
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index c3ad503aa..426a873b1 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -27,11 +27,12 @@ import (
 )
 
 const (
-	linkAddr         = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
-	linkLocalAddr    = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-	uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-	uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-	globalAddr       = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	linkAddr               = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkLocalAddr          = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+	globalAddr             = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
 )
 
 func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
@@ -256,6 +257,85 @@ func TestIsV6UniqueLocalAddress(t *testing.T) {
 	}
 }
 
+func TestIsV6LinkLocalMulticastAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: true,
+		},
+		{
+			name:     "Valid Link Local Multicast with flags",
+			addr:     "\xff\xf2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+			expected: true,
+		},
+		{
+			name:     "Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Multicast",
+			addr:     "\xe0\x00\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalMulticastAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6LinkLocalMulticastAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
+func TestIsV6LinkLocalAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: true,
+		},
+		{
+			name:     "Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: false,
+		},
+		{
+			name:     "Unique Local",
+			addr:     uniqueLocalAddr1,
+			expected: false,
+		},
+		{
+			name:     "Global",
+			addr:     globalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Link Local",
+			addr:     "\xa9\xfe\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6LinkLocalAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
 func TestScopeForIPv6Address(t *testing.T) {
 	tests := []struct {
 		name  string
@@ -270,11 +350,17 @@ func TestScopeForIPv6Address(t *testing.T) {
 			err:   nil,
 		},
 		{
-			name:  "Link Local",
+			name:  "Link Local Unicast",
 			addr:  linkLocalAddr,
 			scope: header.LinkLocalScope,
 			err:   nil,
 		},
+		{
+			name:  "Link Local Multicast",
+			addr:  linkLocalMulticastAddr,
+			scope: header.LinkLocalScope,
+			err:   nil,
+		},
 		{
 			name:  "Global",
 			addr:  globalAddr,
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index edf6bec52..e15db40fb 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -2790,13 +2790,14 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 
 func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 	const (
-		linkLocalAddr1   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		linkLocalAddr2   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		globalAddr1      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		globalAddr2      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		nicID            = 1
+		linkLocalAddr1         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		linkLocalAddr2         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		globalAddr1            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		globalAddr2            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		nicID                  = 1
 	)
 
 	// Rule 3 is not tested here, and is instead tested by NDP's AutoGenAddr test.
@@ -2869,6 +2870,18 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 			connectAddr:       linkLocalAddr2,
 			expectedLocalAddr: linkLocalAddr1,
 		},
+		{
+			name:              "Link Local most preferred for link local multicast (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local most preferred for link local multicast (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
 		{
 			name:              "Unique Local most preferred (last address)",
 			nicAddrs:          []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
-- 
cgit v1.2.3


From 87288b26a1c40776da31c1edcbe9b1f3a6f5c1ed Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 25 Feb 2020 15:34:20 -0800
Subject: Add netlink sockopt logging to strace.

PiperOrigin-RevId: 297220008
---
 pkg/sentry/strace/socket.go | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 51e6d81b2..c0512de89 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -632,4 +632,13 @@ var sockOptNames = map[uint64]abi.ValueSet{
 		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
 		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
 	},
+	linux.SOL_NETLINK: {
+		linux.NETLINK_BROADCAST_ERROR:  "NETLINK_BROADCAST_ERROR",
+		linux.NETLINK_CAP_ACK:          "NETLINK_CAP_ACK",
+		linux.NETLINK_DUMP_STRICT_CHK:  "NETLINK_DUMP_STRICT_CHK",
+		linux.NETLINK_EXT_ACK:          "NETLINK_EXT_ACK",
+		linux.NETLINK_LIST_MEMBERSHIPS: "NETLINK_LIST_MEMBERSHIPS",
+		linux.NETLINK_NO_ENOBUFS:       "NETLINK_NO_ENOBUFS",
+		linux.NETLINK_PKTINFO:          "NETLINK_PKTINFO",
+	},
 }
-- 
cgit v1.2.3


From 73201f4c5700ce7af404b968698b86d80989ab1e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 25 Feb 2020 07:30:12 +0000
Subject: Code Clean: Move arch independent codes to common file in kvm pkg.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Iefbdf53e8e8d6d23ae75d8a2ff0d2a6e71f414d8
---
 pkg/sentry/platform/kvm/kvm.go       | 32 ++++++++++++++++++++++++++++++++
 pkg/sentry/platform/kvm/kvm_amd64.go | 32 --------------------------------
 pkg/sentry/platform/kvm/kvm_arm64.go | 32 --------------------------------
 3 files changed, 32 insertions(+), 64 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index 972ba85c3..a9b4af43e 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -27,6 +27,38 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+	slot          uint32
+	flags         uint32
+	guestPhysAddr uint64
+	memorySize    uint64
+	userspaceAddr uint64
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+	requestInterruptWindow uint8
+	_                      [7]uint8
+
+	exitReason                 uint32
+	readyForInterruptInjection uint8
+	ifFlag                     uint8
+	_                          [2]uint8
+
+	cr8      uint64
+	apicBase uint64
+
+	// This is the union data for exits. Interpretation depends entirely on
+	// the exitReason above (see vCPU code for more information).
+	data [32]uint64
+}
+
 // KVM represents a lightweight VM context.
 type KVM struct {
 	platform.NoCPUPreemptionDetection
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
index c5a6f9c7d..093497bc4 100644
--- a/pkg/sentry/platform/kvm/kvm_amd64.go
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -21,17 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
-// userMemoryRegion is a region of physical memory.
-//
-// This mirrors kvm_memory_region.
-type userMemoryRegion struct {
-	slot          uint32
-	flags         uint32
-	guestPhysAddr uint64
-	memorySize    uint64
-	userspaceAddr uint64
-}
-
 // userRegs represents KVM user registers.
 //
 // This mirrors kvm_regs.
@@ -169,27 +158,6 @@ type modelControlRegisters struct {
 	entries [16]modelControlRegister
 }
 
-// runData is the run structure. This may be mapped for synchronous register
-// access (although that doesn't appear to be supported by my kernel at least).
-//
-// This mirrors kvm_run.
-type runData struct {
-	requestInterruptWindow uint8
-	_                      [7]uint8
-
-	exitReason                 uint32
-	readyForInterruptInjection uint8
-	ifFlag                     uint8
-	_                          [2]uint8
-
-	cr8      uint64
-	apicBase uint64
-
-	// This is the union data for exits. Interpretation depends entirely on
-	// the exitReason above (see vCPU code for more information).
-	data [32]uint64
-}
-
 // cpuidEntry is a single CPUID entry.
 //
 // This mirrors kvm_cpuid_entry2.
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 2319c86d3..79045651e 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -20,17 +20,6 @@ import (
 	"syscall"
 )
 
-// userMemoryRegion is a region of physical memory.
-//
-// This mirrors kvm_memory_region.
-type userMemoryRegion struct {
-	slot          uint32
-	flags         uint32
-	guestPhysAddr uint64
-	memorySize    uint64
-	userspaceAddr uint64
-}
-
 type kvmOneReg struct {
 	id   uint64
 	addr uint64
@@ -53,27 +42,6 @@ type userRegs struct {
 	fpRegs  userFpsimdState
 }
 
-// runData is the run structure. This may be mapped for synchronous register
-// access (although that doesn't appear to be supported by my kernel at least).
-//
-// This mirrors kvm_run.
-type runData struct {
-	requestInterruptWindow uint8
-	_                      [7]uint8
-
-	exitReason                 uint32
-	readyForInterruptInjection uint8
-	ifFlag                     uint8
-	_                          [2]uint8
-
-	cr8      uint64
-	apicBase uint64
-
-	// This is the union data for exits. Interpretation depends entirely on
-	// the exitReason above (see vCPU code for more information).
-	data [32]uint64
-}
-
 // updateGlobalOnce does global initialization. It has to be called only once.
 func updateGlobalOnce(fd int) error {
 	physicalInit()
-- 
cgit v1.2.3


From fba479b3c78621cb122af20d1d677fe9193a971c Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Feb 2020 19:03:23 -0800
Subject: Fix DATA RACE in fs.MayDelete.

MayDelete must lock the directory also, otherwise concurrent renames may
race. Note that this also changes the methods to be aligned with the actual
Remove and RemoveDirectory methods to minimize confusion when reading the
code. (It was hard to see that resolution was correct.)

PiperOrigin-RevId: 297258304
---
 pkg/sentry/fs/dirent.go               | 25 ++++++++++++++-----------
 pkg/sentry/syscalls/linux/sys_file.go |  4 ++--
 2 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index acab0411a..e0b32e1c1 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -1438,8 +1438,8 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName
 	}, nil
 }
 
-func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
-	uattr, err := dir.Inode.UnstableAttr(ctx)
+func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error {
+	uattr, err := d.Inode.UnstableAttr(ctx)
 	if err != nil {
 		return syserror.EPERM
 	}
@@ -1465,30 +1465,33 @@ func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
 	return syserror.EPERM
 }
 
-// MayDelete determines whether `name`, a child of `dir`, can be deleted or
+// MayDelete determines whether `name`, a child of `d`, can be deleted or
 // renamed by `ctx`.
 //
 // Compare Linux kernel fs/namei.c:may_delete.
-func MayDelete(ctx context.Context, root, dir *Dirent, name string) error {
-	if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
+func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error {
+	if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
 		return err
 	}
 
-	victim, err := dir.Walk(ctx, root, name)
+	unlock := d.lockDirectory()
+	defer unlock()
+
+	victim, err := d.walk(ctx, root, name, true /* may unlock */)
 	if err != nil {
 		return err
 	}
 	defer victim.DecRef()
 
-	return mayDelete(ctx, dir, victim)
+	return d.mayDelete(ctx, victim)
 }
 
 // mayDelete determines whether `victim`, a child of `dir`, can be deleted or
 // renamed by `ctx`.
 //
 // Preconditions: `dir` is writable and executable by `ctx`.
-func mayDelete(ctx context.Context, dir, victim *Dirent) error {
-	if err := checkSticky(ctx, dir, victim); err != nil {
+func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error {
+	if err := d.checkSticky(ctx, victim); err != nil {
 		return err
 	}
 
@@ -1542,7 +1545,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 	defer renamed.DecRef()
 
 	// Check that the renamed dirent is deletable.
-	if err := mayDelete(ctx, oldParent, renamed); err != nil {
+	if err := oldParent.mayDelete(ctx, renamed); err != nil {
 		return err
 	}
 
@@ -1580,7 +1583,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 		// across the Rename, so must call DecRef manually (no defer).
 
 		// Check that we can delete replaced.
-		if err := mayDelete(ctx, newParent, replaced); err != nil {
+		if err := newParent.mayDelete(ctx, replaced); err != nil {
 			replaced.DecRef()
 			return err
 		}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index c21f14dc0..d10a9bed8 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -1236,7 +1236,7 @@ func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTEMPTY
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
@@ -1517,7 +1517,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTDIR
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
-- 
cgit v1.2.3


From a92087f0f8fe82ce99414ec99ffe33e514cb21f6 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 25 Feb 2020 19:12:22 -0800
Subject: Add VFS.NewDisconnectedMount().

Analogous to Linux's kern_mount().

PiperOrigin-RevId: 297259580
---
 pkg/sentry/vfs/mount.go | 17 +++++++++++++++++
 pkg/sentry/vfs/vfs.go   | 16 +++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 9912df799..31a4e5480 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -139,6 +139,23 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
+// NewDisconnectedMount returns a Mount representing fs with the given root
+// (which may be nil). The new Mount is not associated with any MountNamespace
+// and is not connected to any other Mounts. References are taken on fs and
+// root.
+func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
+	fs.IncRef()
+	if root != nil {
+		root.IncRef()
+	}
+	return &Mount{
+		vfs:  vfs,
+		fs:   fs,
+		root: root,
+		refs: 1,
+	}, nil
+}
+
 // MountAt creates and mounts a Filesystem configured by the given arguments.
 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
 	rft := vfs.getFilesystemType(fsTypeName)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 73f8043be..bde81e1ef 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -126,17 +126,23 @@ func (vfs *VirtualFilesystem) Init() error {
 	// Construct vfs.anonMount.
 	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
 	if err != nil {
-		return err
+		// This shouldn't be possible since anonBlockDevMinorNext was
+		// initialized to 1 above (no device numbers have been allocated yet).
+		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
 	}
 	anonfs := anonFilesystem{
 		devMinor: anonfsDevMinor,
 	}
 	anonfs.vfsfs.Init(vfs, &anonfs)
-	vfs.anonMount = &Mount{
-		vfs:  vfs,
-		fs:   &anonfs.vfsfs,
-		refs: 1,
+	defer anonfs.vfsfs.DecRef()
+	anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
+	if err != nil {
+		// We should not be passing any MountOptions that would cause
+		// construction of this mount to fail.
+		panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
 	}
+	vfs.anonMount = anonMount
+
 	return nil
 }
 
-- 
cgit v1.2.3


From d8ed78431162fcaed0b31b54d939c8a54d4736e7 Mon Sep 17 00:00:00 2001
From: moricho <ikeda.morito@gmail.com>
Date: Tue, 25 Feb 2020 16:49:08 +0900
Subject: add profile option

---
 pkg/sentry/control/pprof.go | 34 +++++++++++++++++++++---
 runsc/boot/controller.go    | 13 +++++----
 runsc/cmd/debug.go          | 64 +++++++++++++++++++++++++++++++++++++--------
 runsc/sandbox/sandbox.go    | 60 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+), 19 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 151808911..5d1907c0e 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -117,15 +117,43 @@ func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error {
 	return nil
 }
 
-// Goroutine is an RPC stub which dumps out the stack trace for all running
+// GoroutineProfile is an RPC stub which dumps out the stack trace for all running
 // goroutines.
-func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error {
+func (p *Profile) GoroutineProfile(o *ProfileOpts, _ *struct{}) error {
 	if len(o.FilePayload.Files) < 1 {
 		return errNoOutput
 	}
 	output := o.FilePayload.Files[0]
 	defer output.Close()
-	if err := pprof.Lookup("goroutine").WriteTo(output, 2); err != nil {
+	if err := pprof.Lookup("goroutine").WriteTo(output, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// BlockProfile is an RPC stub which dumps out the stack trace that led to
+// blocking on synchronization primitives.
+func (p *Profile) BlockProfile(o *ProfileOpts, _ *struct{}) error {
+	if len(o.FilePayload.Files) < 1 {
+		return errNoOutput
+	}
+	output := o.FilePayload.Files[0]
+	defer output.Close()
+	if err := pprof.Lookup("block").WriteTo(output, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// MutexProfile is an RPC stub which dumps out the stack trace of holders of
+// contended mutexes.
+func (p *Profile) MutexProfile(o *ProfileOpts, _ *struct{}) error {
+	if len(o.FilePayload.Files) < 1 {
+		return errNoOutput
+	}
+	output := o.FilePayload.Files[0]
+	defer output.Close()
+	if err := pprof.Lookup("mutex").WriteTo(output, 0); err != nil {
 		return err
 	}
 	return nil
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 17e774e0c..8125d5061 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -101,11 +101,14 @@ const (
 
 // Profiling related commands (see pprof.go for more details).
 const (
-	StartCPUProfile = "Profile.StartCPUProfile"
-	StopCPUProfile  = "Profile.StopCPUProfile"
-	HeapProfile     = "Profile.HeapProfile"
-	StartTrace      = "Profile.StartTrace"
-	StopTrace       = "Profile.StopTrace"
+	StartCPUProfile  = "Profile.StartCPUProfile"
+	StopCPUProfile   = "Profile.StopCPUProfile"
+	HeapProfile      = "Profile.HeapProfile"
+	GoroutineProfile = "Profile.GoroutineProfile"
+	BlockProfile     = "Profile.BlockProfile"
+	MutexProfile     = "Profile.MutexProfile"
+	StartTrace       = "Profile.StartTrace"
+	StopTrace        = "Profile.StopTrace"
 )
 
 // Logging related commands (see logging.go for more details).
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 79965460e..b5de2588b 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,17 +32,20 @@ import (
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid         int
-	stacks      bool
-	signal      int
-	profileHeap string
-	profileCPU  string
-	trace       string
-	strace      string
-	logLevel    string
-	logPackets  string
-	duration    time.Duration
-	ps          bool
+	pid              int
+	stacks           bool
+	signal           int
+	profileHeap      string
+	profileCPU       string
+	profileGoroutine string
+	profileBlock     string
+	profileMutex     string
+	trace            string
+	strace           string
+	logLevel         string
+	logPackets       string
+	duration         time.Duration
+	ps               bool
 }
 
 // Name implements subcommands.Command.
@@ -66,6 +69,9 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
+	f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
+	f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
+	f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
 	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
 	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
 	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
@@ -147,6 +153,42 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Heap profile written to %q", d.profileHeap)
 	}
+	if d.profileGoroutine != "" {
+		f, err := os.Create(d.profileGoroutine)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.GoroutineProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Goroutine profile written to %q", d.profileGoroutine)
+	}
+	if d.profileBlock != "" {
+		f, err := os.Create(d.profileBlock)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.BlockProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Block profile written to %q", d.profileBlock)
+	}
+	if d.profileMutex != "" {
+		f, err := os.Create(d.profileMutex)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.MutexProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Mutex profile written to %q", d.profileMutex)
+	}
 
 	delay := false
 	if d.profileCPU != "" {
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ec72bdbfd..2e0e2fd66 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -972,6 +972,66 @@ func (s *Sandbox) StopCPUProfile() error {
 	return nil
 }
 
+// GoroutineProfile writes a goroutine profile to the given file.
+func (s *Sandbox) GoroutineProfile(f *os.File) error {
+	log.Debugf("Goroutine profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// BlockProfile writes a block profile to the given file.
+func (s *Sandbox) BlockProfile(f *os.File) error {
+	log.Debugf("Block profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.BlockProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q block profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// MutexProfile writes a mutex profile to the given file.
+func (s *Sandbox) MutexProfile(f *os.File) error {
+	log.Debugf("Mutex profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.MutexProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q mutex profile: %v", s.ID, err)
+	}
+	return nil
+}
+
 // StartTrace start trace  writing to the given file.
 func (s *Sandbox) StartTrace(f *os.File) error {
 	log.Debugf("Trace start %q", s.ID)
-- 
cgit v1.2.3


From 8fb84f78adfc0dba964ebe97edb51ebf8a80f752 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Wed, 26 Feb 2020 19:28:20 -0800
Subject: Fix construct of linux.Stat for arm64.

PiperOrigin-RevId: 297494373
---
 pkg/sentry/syscalls/linux/BUILD              |  2 ++
 pkg/sentry/syscalls/linux/sys_stat.go        | 18 -----------
 pkg/sentry/syscalls/linux/sys_stat_amd64.go  | 45 +++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/sys_stat_arm64.go  | 45 +++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/BUILD         |  2 ++
 pkg/sentry/syscalls/linux/vfs2/stat.go       | 23 --------------
 pkg/sentry/syscalls/linux/vfs2/stat_amd64.go | 46 ++++++++++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/stat_arm64.go | 46 ++++++++++++++++++++++++++++
 8 files changed, 186 insertions(+), 41 deletions(-)
 create mode 100644 pkg/sentry/syscalls/linux/sys_stat_amd64.go
 create mode 100644 pkg/sentry/syscalls/linux/sys_stat_arm64.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat_arm64.go

(limited to 'pkg')

diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index c7883e68e..0d24fd3c4 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,6 +42,8 @@ go_library(
         "sys_socket.go",
         "sys_splice.go",
         "sys_stat.go",
+        "sys_stat_amd64.go",
+        "sys_stat_arm64.go",
         "sys_sync.go",
         "sys_sysinfo.go",
         "sys_syslog.go",
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 701b27b4a..9bd2df104 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -25,24 +25,6 @@ import (
 
 // LINT.IfChange
 
-func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
-	return linux.Stat{
-		Dev:     sattr.DeviceID,
-		Ino:     sattr.InodeID,
-		Nlink:   uattr.Links,
-		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
-		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
-		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
-		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
-		Size:    uattr.Size,
-		Blksize: sattr.BlockSize,
-		Blocks:  uattr.Usage / 512,
-		ATime:   uattr.AccessTime.Timespec(),
-		MTime:   uattr.ModificationTime.Timespec(),
-		CTime:   uattr.StatusChangeTime.Timespec(),
-	}
-}
-
 // Stat implements linux syscall stat(2).
 func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
new file mode 100644
index 000000000..0a04a6113
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uattr.Links,
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: sattr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_amd64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
new file mode 100644
index 000000000..5a3b1bfad
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uint32(uattr.Links),
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: int32(sattr.BlockSize),
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
+}
+
+// LINT.ThenChange(vfs2/stat_arm64.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index f51761e81..e7695e995 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -22,6 +22,8 @@ go_library(
         "read_write.go",
         "setstat.go",
         "stat.go",
+        "stat_amd64.go",
+        "stat_arm64.go",
         "sync.go",
         "xattr.go",
     ],
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index dca8d7011..12c532310 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -113,29 +113,6 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
 	return stat.CopyOut(t, statAddr)
 }
 
-// This takes both input and output as pointer arguments to avoid copying large
-// structs.
-func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
-	// Linux just copies fields from struct kstat without regard to struct
-	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
-	userns := t.UserNamespace()
-	*stat = linux.Stat{
-		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
-		Ino:     statx.Ino,
-		Nlink:   uint64(statx.Nlink),
-		Mode:    uint32(statx.Mode),
-		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
-		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
-		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
-		Size:    int64(statx.Size),
-		Blksize: int64(statx.Blksize),
-		Blocks:  int64(statx.Blocks),
-		ATime:   timespecFromStatxTimestamp(statx.Atime),
-		MTime:   timespecFromStatxTimestamp(statx.Mtime),
-		CTime:   timespecFromStatxTimestamp(statx.Ctime),
-	}
-}
-
 func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
 	return linux.Timespec{
 		Sec:  sxts.Sec,
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
new file mode 100644
index 000000000..2da538fc6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint64(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int64(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
new file mode 100644
index 000000000..88b9c7627
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint32(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int32(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
-- 
cgit v1.2.3


From abf7ebcd38e8c2750f4542f29115140bb2b44a9b Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Thu, 27 Feb 2020 10:59:32 -0800
Subject: Internal change.

PiperOrigin-RevId: 297638665
---
 pkg/sentry/socket/netstack/netstack.go |  40 +++++++++--
 pkg/tcpip/transport/packet/endpoint.go |  21 +++++-
 test/syscalls/linux/packet_socket.cc   | 124 ++++++++++++++++++++++++++++++---
 3 files changed, 167 insertions(+), 18 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e187276c5..48c268bfa 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,14 +712,40 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
-	if err != nil {
-		return err
-	}
-	if err := s.checkFamily(family, true /* exact */); err != nil {
-		return err
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// In function AddressAndFamily, we check the address length which is
+	// not needed for AF_PACKET bind.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
 	}
-	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 5722815e9..09a1cd436 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -76,6 +76,7 @@ type endpoint struct {
 	sndBufSize int
 	closed     bool
 	stats      tcpip.TransportEndpointStats `state:"nosave"`
+	bound      bool
 }
 
 // NewEndpoint returns a new packet endpoint.
@@ -125,6 +126,7 @@ func (ep *endpoint) Close() {
 	}
 
 	ep.closed = true
+	ep.bound = false
 	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 }
 
@@ -216,7 +218,24 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
 	// - packet(7).
 
-	return tcpip.ErrNotSupported
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.bound {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// Unregister endpoint with all the nics.
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	// Bind endpoint to receive packets from specific interface.
+	if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil {
+		return err
+	}
+
+	ep.bound = true
+
+	return nil
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 92ae55eec..bc22de788 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <ifaddrs.h>
 #include <linux/capability.h>
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
@@ -163,16 +164,11 @@ int CookedPacketTest::GetLoopbackIndex() {
   return ifr.ifr_ifindex;
 }
 
-// Receive via a packet socket.
-TEST_P(CookedPacketTest, Receive) {
-  // Let's use a simple IP payload: a UDP datagram.
-  FileDescriptor udp_sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-  SendUDPMessage(udp_sock.get());
-
+// Receive and verify the message via packet socket on interface.
+void ReceiveMessage(int sock, int ifindex) {
   // Wait for the socket to become readable.
   struct pollfd pfd = {};
-  pfd.fd = socket_;
+  pfd.fd = sock;
   pfd.events = POLLIN;
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
 
@@ -182,9 +178,10 @@ TEST_P(CookedPacketTest, Receive) {
   char buf[64];
   struct sockaddr_ll src = {};
   socklen_t src_len = sizeof(src);
-  ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
+  ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
+
   // sockaddr_ll ends with an 8 byte physical address field, but ethernet
   // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
   // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
@@ -194,7 +191,7 @@ TEST_P(CookedPacketTest, Receive) {
   // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
+  EXPECT_EQ(src.sll_ifindex, ifindex);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -222,6 +219,18 @@ TEST_P(CookedPacketTest, Receive) {
   EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
 }
 
+// Receive via a packet socket.
+TEST_P(CookedPacketTest, Receive) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  int loopback_index = GetLoopbackIndex();
+  ReceiveMessage(socket_, loopback_index);
+}
+
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
   // TODO(b/129292371): Remove once we support packet socket writing.
@@ -313,6 +322,101 @@ TEST_P(CookedPacketTest, Send) {
   EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
 }
 
+// Bind and receive via packet socket.
+TEST_P(CookedPacketTest, BindReceive) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  ReceiveMessage(socket_, bind_addr.sll_ifindex);
+}
+
+// Double Bind socket.
+TEST_P(CookedPacketTest, DoubleBind) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Binding socket again should fail.
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+           sizeof(bind_addr)),
+      // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched
+      // to EADDRINUSE.
+      AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+// Bind and verify we do not receive data on interface which is not bound
+TEST_P(CookedPacketTest, BindDrop) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct ifaddrs* if_addr_list = nullptr;
+  auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+  ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+  // Get interface other than loopback.
+  struct ifreq ifr = {};
+  for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+    if (strcmp(i->ifa_name, "lo") != 0) {
+      strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name));
+      break;
+    }
+  }
+
+  // Skip if no interface is available other than loopback.
+  if (strlen(ifr.ifr_name) == 0) {
+    GTEST_SKIP();
+  }
+
+  // Get interface index.
+  EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  EXPECT_NE(ifr.ifr_ifindex, 0);
+
+  // Bind to packet socket requires only family, protocol and ifindex.
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = ifr.ifr_ifindex;
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Send to loopback interface.
+  struct sockaddr_in dest = {};
+  dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  dest.sin_family = AF_INET;
+  dest.sin_port = kPort;
+  EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0,
+                     reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+              SyscallSucceedsWithValue(sizeof(kMessage)));
+
+  // Wait and make sure the socket never receives any data.
+  struct pollfd pfd = {};
+  pfd.fd = socket_;
+  pfd.events = POLLIN;
+  EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From c6bdc6b05b4abfcb3c677013496c9a6b1d1365dd Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 27 Feb 2020 14:14:34 -0800
Subject: Fix a race in TCP endpoint teardown and teardown the stack in
 tcp_test.

Call stack.Close on stacks when we are done with them in tcp_test. This avoids
leaking resources and reduces the test's flakiness when race/gotsan is enabled.
It also provides test coverage for the race also fixed in this change, which
can be reliably triggered with the stack.Close change (and without the other
changes) when race/gotsan is enabled.

The race was possible when calling Abort (via stack.Close) on an endpoint
processing a SYN segment as part of a passive connect.

Updates #1564

PiperOrigin-RevId: 297685432
---
 pkg/tcpip/transport/tcp/accept.go                  |  1 +
 pkg/tcpip/transport/tcp/connect.go                 |  2 +-
 pkg/tcpip/transport/tcp/endpoint.go                | 16 +++++++++++++++-
 pkg/tcpip/transport/tcp/testing/context/context.go |  1 +
 4 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 13e383ffc..85049e54e 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -236,6 +236,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.route.NetProto}
 	n.rcvBufSize = int(l.rcvWnd)
 	n.amss = mssForRoute(&n.route)
+	n.setEndpointState(StateConnecting)
 
 	n.maybeEnableTimestamp(rcvdSynOpts)
 	n.maybeEnableSACKPermitted(rcvdSynOpts)
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 7730e6445..cd247f3e1 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -577,7 +577,7 @@ func (h *handshake) execute() *tcpip.Error {
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
-			if n&notifyClose != 0 {
+			if (n&notifyClose)|(n&notifyAbort) != 0 {
 				return tcpip.ErrAborted
 			}
 			if n&notifyDrain != 0 {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index f1ad19dac..9e72730bd 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -798,7 +798,21 @@ func (e *endpoint) Abort() {
 	// If the endpoint disconnected after the check, nothing needs to be
 	// done, so sending a notification which will potentially be ignored is
 	// fine.
-	if e.EndpointState().connected() {
+	//
+	// If the endpoint connecting finishes after the check, the endpoint
+	// is either in a connected state (where we would notifyAbort anyway),
+	// SYN-RECV (where we would also notifyAbort anyway), or in an error
+	// state where nothing is required and the notification can be safely
+	// ignored.
+	//
+	// Endpoints where a Close during connecting or SYN-RECV state would be
+	// problematic are set to state connecting before being registered (and
+	// thus possible to be Aborted). They are never available in initial
+	// state.
+	//
+	// Endpoints transitioning from initial to connecting state may be
+	// safely either closed or sent notifyAbort.
+	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
 		e.notifyProtocolGoroutine(notifyAbort)
 		return
 	}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 1e9a0dea3..8cea20fb5 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -204,6 +204,7 @@ func (c *Context) Cleanup() {
 	if c.EP != nil {
 		c.EP.Close()
 	}
+	c.Stack().Close()
 }
 
 // Stack returns a reference to the stack in the Context.
-- 
cgit v1.2.3


From 0f8a9e362337ee684042331c5bf24a3cb43d6fc4 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 28 Feb 2020 10:13:59 -0800
Subject: Change dup2 call to dup3

We changed syscalls to allow dup3 for ARM64.

Updates #1198

PiperOrigin-RevId: 297870816
---
 pkg/sentry/fsimpl/gofer/gofer.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index d00850e25..c4a8f0b38 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1045,13 +1045,13 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 				// using the old file descriptor, preventing us from safely
 				// closing it. We could handle this by invalidating existing
 				// memmap.Translations, but this is expensive. Instead, use
-				// dup2() to make the old file descriptor refer to the new file
+				// dup3 to make the old file descriptor refer to the new file
 				// description, then close the new file descriptor (which is no
 				// longer needed). Racing callers may use the old or new file
 				// description, but this doesn't matter since they refer to the
 				// same file (unless d.fs.opts.overlayfsStaleRead is true,
 				// which we handle separately).
-				if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil {
+				if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil {
 					d.handleMu.Unlock()
 					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
 					h.close(ctx)
-- 
cgit v1.2.3


From 6b4d36e3253238dd72d0861ac1220d147e1de8dd Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 28 Feb 2020 10:37:52 -0800
Subject: Hide /dev/net/tun when using hostinet.

/dev/net/tun does not currently work with hostinet. This has caused some
program starts failing because it thinks the feature exists.

PiperOrigin-RevId: 297876196
---
 pkg/sentry/fs/dev/BUILD                |  1 +
 pkg/sentry/fs/dev/dev.go               |  7 +++++--
 pkg/sentry/fs/dev/net_tun.go           |  7 +++++++
 pkg/sentry/kernel/kernel.go            |  4 ++++
 test/syscalls/BUILD                    |  5 +++++
 test/syscalls/linux/BUILD              | 12 +++++++++++
 test/syscalls/linux/dev.cc             |  7 -------
 test/syscalls/linux/tuntap.cc          |  7 +++++++
 test/syscalls/linux/tuntap_hostinet.cc | 37 ++++++++++++++++++++++++++++++++++
 9 files changed, 78 insertions(+), 9 deletions(-)
 create mode 100644 test/syscalls/linux/tuntap_hostinet.cc

(limited to 'pkg')

diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9b6bb26d0..9379a4d7b 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 7e66c29b0..acbd401a0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -124,10 +125,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+	}
 
-		"net": newDirectory(ctx, map[string]*fs.Inode{
+	if isNetTunSupported(inet.StackFromContext(ctx)) {
+		contents["net"] = newDirectory(ctx, map[string]*fs.Inode{
 			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
-		}, msrc),
+		}, msrc)
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index 755644488..dc7ad075a 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -168,3 +169,9 @@ func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.Eve
 func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
 	fops.device.EventUnregister(e)
 }
+
+// isNetTunSupported returns whether /dev/net/tun device is supported for s.
+func isNetTunSupported(s inet.Stack) bool {
+	_, ok := s.(*netstack.Stack)
+	return ok
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 8b76750e9..1d627564f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -755,6 +755,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -1481,6 +1483,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 3518e862d..a69b0ce13 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -680,6 +680,11 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:tuntap_test")
 
+syscall_test(
+    add_hostinet = True,
+    test = "//test/syscalls/linux:tuntap_hostinet_test",
+)
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 704bae17b..70c120e42 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3460,6 +3460,18 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_hostinet_test",
+    testonly = 1,
+    srcs = ["tuntap_hostinet.cc"],
+    linkstatic = 1,
+    deps = [
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4e473268c..4dd302eed 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,13 +153,6 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
-TEST(DevTest, NetTunExists) {
-  struct stat statbuf = {};
-  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds());
-  // Check that it's a character device with rw-rw-rw- permissions.
-  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
-}
-
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index f6ac9d7b8..f734511d6 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -153,6 +153,13 @@ std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
 
 }  // namespace
 
+TEST(TuntapStaticTest, NetTunExists) {
+  struct stat statbuf;
+  ASSERT_THAT(stat(kDevNetTun, &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
 class TuntapTest : public ::testing::Test {
  protected:
   void TearDown() override {
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
new file mode 100644
index 000000000..0c527419e
--- /dev/null
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(TuntapHostInetTest, NoNetTun) {
+  SKIP_IF(!IsRunningOnGvisor());
+
+  struct stat statbuf;
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
+}
+
+}  // namespace
+}  // namespace testing
+
+}  // namespace gvisor
-- 
cgit v1.2.3


From 463f4217d109ded8af758fe51a5daf8670da9794 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 28 Feb 2020 12:28:10 -0800
Subject: Make pipe buffer implementation standard.

A follow-up change will convert the networking code to use this standard
pipe implementation.

PiperOrigin-RevId: 297903206
---
 pkg/buffer/BUILD                      |  39 ++++
 pkg/buffer/buffer.go                  |  67 ++++++
 pkg/buffer/safemem.go                 | 131 ++++++++++++
 pkg/buffer/view.go                    | 382 ++++++++++++++++++++++++++++++++++
 pkg/buffer/view_test.go               | 233 +++++++++++++++++++++
 pkg/buffer/view_unsafe.go             |  25 +++
 pkg/sentry/kernel/pipe/BUILD          |  18 +-
 pkg/sentry/kernel/pipe/buffer.go      | 115 ----------
 pkg/sentry/kernel/pipe/buffer_test.go |  32 ---
 pkg/sentry/kernel/pipe/pipe.go        | 118 ++---------
 pkg/sentry/kernel/pipe/pipe_util.go   |  25 +--
 11 files changed, 912 insertions(+), 273 deletions(-)
 create mode 100644 pkg/buffer/BUILD
 create mode 100644 pkg/buffer/buffer.go
 create mode 100644 pkg/buffer/safemem.go
 create mode 100644 pkg/buffer/view.go
 create mode 100644 pkg/buffer/view_test.go
 create mode 100644 pkg/buffer/view_unsafe.go
 delete mode 100644 pkg/sentry/kernel/pipe/buffer.go
 delete mode 100644 pkg/sentry/kernel/pipe/buffer_test.go

(limited to 'pkg')

diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD
new file mode 100644
index 000000000..a77a3beea
--- /dev/null
+++ b/pkg/buffer/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "buffer_list",
+    out = "buffer_list.go",
+    package = "buffer",
+    prefix = "buffer",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Buffer",
+        "Linker": "*Buffer",
+    },
+)
+
+go_library(
+    name = "buffer",
+    srcs = [
+        "buffer.go",
+        "buffer_list.go",
+        "safemem.go",
+        "view.go",
+        "view_unsafe.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/safemem",
+    ],
+)
+
+go_test(
+    name = "buffer_test",
+    size = "small",
+    srcs = ["view_test.go"],
+    library = ":buffer",
+)
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
new file mode 100644
index 000000000..d5f64609b
--- /dev/null
+++ b/pkg/buffer/buffer.go
@@ -0,0 +1,67 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package buffer provides the implementation of a buffer view.
+package buffer
+
+import (
+	"sync"
+)
+
+const bufferSize = 8144 // See below.
+
+// Buffer encapsulates a queueable byte buffer.
+//
+// Note that the total size is slightly less than two pages. This is done
+// intentionally to ensure that the buffer object aligns with runtime
+// internals. We have no hard size or alignment requirements. This two page
+// size will effectively minimize internal fragmentation, but still have a
+// large enough chunk to limit excessive segmentation.
+//
+// +stateify savable
+type Buffer struct {
+	data  [bufferSize]byte
+	read  int
+	write int
+	bufferEntry
+}
+
+// Reset resets internal data.
+//
+// This must be called before use.
+func (b *Buffer) Reset() {
+	b.read = 0
+	b.write = 0
+}
+
+// Empty indicates the buffer is empty.
+//
+// This indicates there is no data left to read.
+func (b *Buffer) Empty() bool {
+	return b.read == b.write
+}
+
+// Full indicates the buffer is full.
+//
+// This indicates there is no capacity left to write.
+func (b *Buffer) Full() bool {
+	return b.write == len(b.data)
+}
+
+// bufferPool is a pool for buffers.
+var bufferPool = sync.Pool{
+	New: func() interface{} {
+		return new(Buffer)
+	},
+}
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
new file mode 100644
index 000000000..071aaa488
--- /dev/null
+++ b/pkg/buffer/safemem.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// WriteBlock returns this buffer as a write Block.
+func (b *Buffer) WriteBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.data[b.write:])
+}
+
+// ReadBlock returns this buffer as a read Block.
+func (b *Buffer) ReadBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.data[b.read:b.write])
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// This will advance the write index.
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	need := int(srcs.NumBytes())
+	if need == 0 {
+		return 0, nil
+	}
+
+	var (
+		dst    safemem.BlockSeq
+		blocks []safemem.Block
+	)
+
+	// Need at least one buffer.
+	firstBuf := v.data.Back()
+	if firstBuf == nil {
+		firstBuf = bufferPool.Get().(*Buffer)
+		v.data.PushBack(firstBuf)
+	}
+
+	// Does the last block have sufficient capacity alone?
+	if l := len(firstBuf.data) - firstBuf.write; l >= need {
+		dst = safemem.BlockSeqOf(firstBuf.WriteBlock())
+	} else {
+		// Append blocks until sufficient.
+		need -= l
+		blocks = append(blocks, firstBuf.WriteBlock())
+		for need > 0 {
+			emptyBuf := bufferPool.Get().(*Buffer)
+			v.data.PushBack(emptyBuf)
+			need -= len(emptyBuf.data) // Full block.
+			blocks = append(blocks, emptyBuf.WriteBlock())
+		}
+		dst = safemem.BlockSeqFromSlice(blocks)
+	}
+
+	// Perform the copy.
+	n, err := safemem.CopySeq(dst, srcs)
+	v.size += int64(n)
+
+	// Update all indices.
+	for left := int(n); left > 0; firstBuf = firstBuf.Next() {
+		if l := len(firstBuf.data) - firstBuf.write; left >= l {
+			firstBuf.write += l // Whole block.
+			left -= l
+		} else {
+			firstBuf.write += left // Partial block.
+			left = 0
+		}
+	}
+
+	return n, err
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+//
+// This will not advance the read index; the caller should follow
+// this call with a call to TrimFront in order to remove the read
+// data from the buffer. This is done to support pipe sematics.
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	need := int(dsts.NumBytes())
+	if need == 0 {
+		return 0, nil
+	}
+
+	var (
+		src    safemem.BlockSeq
+		blocks []safemem.Block
+	)
+
+	firstBuf := v.data.Front()
+	if firstBuf == nil {
+		return 0, io.EOF
+	}
+
+	// Is all the data in a single block?
+	if l := firstBuf.write - firstBuf.read; l >= need {
+		src = safemem.BlockSeqOf(firstBuf.ReadBlock())
+	} else {
+		// Build a list of all the buffers.
+		need -= l
+		blocks = append(blocks, firstBuf.ReadBlock())
+		for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() {
+			need -= buf.write - buf.read
+			blocks = append(blocks, buf.ReadBlock())
+		}
+		src = safemem.BlockSeqFromSlice(blocks)
+	}
+
+	// Perform the copy.
+	n, err := safemem.CopySeq(dsts, src)
+
+	// See above: we would normally advance the read index here, but we
+	// don't do that in order to support pipe semantics. We rely on a
+	// separate call to TrimFront() in this case.
+
+	return n, err
+}
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
new file mode 100644
index 000000000..00fc11e9c
--- /dev/null
+++ b/pkg/buffer/view.go
@@ -0,0 +1,382 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"fmt"
+	"io"
+)
+
+// View is a non-linear buffer.
+//
+// All methods are thread compatible.
+//
+// +stateify savable
+type View struct {
+	data bufferList
+	size int64
+}
+
+// TrimFront removes the first count bytes from the buffer.
+func (v *View) TrimFront(count int64) {
+	if count >= v.size {
+		v.advanceRead(v.size)
+	} else {
+		v.advanceRead(count)
+	}
+}
+
+// Read implements io.Reader.Read.
+//
+// Note that reading does not advance the read index. This must be done
+// manually using TrimFront or other methods.
+func (v *View) Read(p []byte) (int, error) {
+	return v.ReadAt(p, 0)
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (v *View) ReadAt(p []byte, offset int64) (int, error) {
+	var (
+		skipped int64
+		done    int64
+	)
+	for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() {
+		needToSkip := int(offset - skipped)
+		if l := buf.write - buf.read; l <= needToSkip {
+			skipped += int64(l)
+			continue
+		}
+
+		// Actually read data.
+		n := copy(p[done:], buf.data[buf.read+needToSkip:buf.write])
+		skipped += int64(needToSkip)
+		done += int64(n)
+	}
+	if int(done) < len(p) {
+		return int(done), io.EOF
+	}
+	return int(done), nil
+}
+
+// Write implements io.Writer.Write.
+func (v *View) Write(p []byte) (int, error) {
+	v.Append(p) // Does not fail.
+	return len(p), nil
+}
+
+// advanceRead advances the view's read index.
+//
+// Precondition: there must be sufficient bytes in the buffer.
+func (v *View) advanceRead(count int64) {
+	for buf := v.data.Front(); buf != nil && count > 0; {
+		l := int64(buf.write - buf.read)
+		if l > count {
+			// There is still data for reading.
+			buf.read += int(count)
+			v.size -= count
+			count = 0
+			break
+		}
+
+		// Read from this buffer.
+		buf.read += int(l)
+		count -= l
+		v.size -= l
+
+		// When all data has been read from a buffer, we push
+		// it into the empty buffer pool for reuse.
+		oldBuf := buf
+		buf = buf.Next() // Iterate.
+		v.data.Remove(oldBuf)
+		oldBuf.Reset()
+		bufferPool.Put(oldBuf)
+	}
+	if count > 0 {
+		panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count))
+	}
+}
+
+// Truncate truncates the view to the given bytes.
+func (v *View) Truncate(length int64) {
+	if length < 0 || length >= v.size {
+		return // Nothing to do.
+	}
+	for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() {
+		l := int64(buf.write - buf.read) // Local bytes.
+		switch {
+		case v.size-l >= length:
+			// Drop the buffer completely; see above.
+			v.data.Remove(buf)
+			v.size -= l
+			buf.Reset()
+			bufferPool.Put(buf)
+
+		case v.size > length && v.size-l < length:
+			// Just truncate the buffer locally.
+			delta := (length - (v.size - l))
+			buf.write = buf.read + int(delta)
+			v.size = length
+
+		default:
+			// Should never happen.
+			panic("invalid buffer during truncation")
+		}
+	}
+	v.size = length // Save the new size.
+}
+
+// Grow grows the given view to the number of bytes. If zero
+// is true, all these bytes will be zero. If zero is false,
+// then this is the caller's responsibility.
+//
+// Precondition: length must be >= 0.
+func (v *View) Grow(length int64, zero bool) {
+	if length < 0 {
+		panic("negative length provided")
+	}
+	for v.size < length {
+		buf := v.data.Back()
+
+		// Is there at least one buffer?
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*Buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Write up to length bytes.
+		l := len(buf.data) - buf.write
+		if int64(l) > length-v.size {
+			l = int(length - v.size)
+		}
+
+		// Zero the written section; note that this pattern is
+		// specifically recognized and optimized by the compiler.
+		if zero {
+			for i := buf.write; i < buf.write+l; i++ {
+				buf.data[i] = 0
+			}
+		}
+
+		// Advance the index.
+		buf.write += l
+		v.size += int64(l)
+	}
+}
+
+// Prepend prepends the given data.
+func (v *View) Prepend(data []byte) {
+	// Is there any space in the first buffer?
+	if buf := v.data.Front(); buf != nil && buf.read > 0 {
+		// Fill up before the first write.
+		avail := buf.read
+		copy(buf.data[0:], data[len(data)-avail:])
+		data = data[:len(data)-avail]
+		v.size += int64(avail)
+	}
+
+	for len(data) > 0 {
+		// Do we need an empty buffer?
+		buf := bufferPool.Get().(*Buffer)
+		v.data.PushFront(buf)
+
+		// The buffer is empty; copy last chunk.
+		start := len(data) - len(buf.data)
+		if start < 0 {
+			start = 0 // Everything.
+		}
+
+		// We have to put the data at the end of the current
+		// buffer in order to ensure that the next prepend will
+		// correctly fill up the beginning of this buffer.
+		bStart := len(buf.data) - len(data[start:])
+		n := copy(buf.data[bStart:], data[start:])
+		buf.read = bStart
+		buf.write = len(buf.data)
+		data = data[:start]
+		v.size += int64(n)
+	}
+}
+
+// Append appends the given data.
+func (v *View) Append(data []byte) {
+	for done := 0; done < len(data); {
+		buf := v.data.Back()
+
+		// Find the first empty buffer.
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*Buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Copy in to the given buffer.
+		n := copy(buf.data[buf.write:], data[done:])
+		done += n
+		buf.write += n
+		v.size += int64(n)
+	}
+}
+
+// Flatten returns a flattened copy of this data.
+//
+// This method should not be used in any performance-sensitive paths. It may
+// allocate a fresh byte slice sufficiently large to contain all the data in
+// the buffer.
+//
+// N.B. Tee data still belongs to this view, as if there is a single buffer
+// present, then it will be returned directly. This should be used for
+// temporary use only, and a reference to the given slice should not be held.
+func (v *View) Flatten() []byte {
+	if buf := v.data.Front(); buf.Next() == nil {
+		return buf.data[buf.read:buf.write] // Only one buffer.
+	}
+	data := make([]byte, 0, v.size) // Need to flatten.
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		// Copy to the allocated slice.
+		data = append(data, buf.data[buf.read:buf.write]...)
+	}
+	return data
+}
+
+// Size indicates the total amount of data available in this view.
+func (v *View) Size() (sz int64) {
+	sz = v.size // Pre-calculated.
+	return sz
+}
+
+// Copy makes a strict copy of this view.
+func (v *View) Copy() (other View) {
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		other.Append(buf.data[buf.read:buf.write])
+	}
+	return other
+}
+
+// Apply applies the given function across all valid data.
+func (v *View) Apply(fn func([]byte)) {
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		if l := int64(buf.write - buf.read); l > 0 {
+			fn(buf.data[buf.read:buf.write])
+		}
+	}
+}
+
+// Merge merges the provided View with this one.
+//
+// The other view will be empty after this operation.
+func (v *View) Merge(other *View) {
+	// Copy over all buffers.
+	for buf := other.data.Front(); buf != nil && !buf.Empty(); buf = other.data.Front() {
+		other.data.Remove(buf)
+		v.data.PushBack(buf)
+	}
+
+	// Adjust sizes.
+	v.size += other.size
+	other.size = 0
+}
+
+// WriteFromReader writes to the buffer from an io.Reader.
+func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
+	var (
+		done int64
+		n    int
+		err  error
+	)
+	for done < count {
+		buf := v.data.Back()
+
+		// Find the first empty buffer.
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*Buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Is this less than the minimum batch?
+		if len(buf.data[buf.write:]) < minBatch && (count-done) >= int64(minBatch) {
+			tmp := make([]byte, minBatch)
+			n, err = r.Read(tmp)
+			v.Write(tmp[:n])
+			done += int64(n)
+			if err != nil {
+				break
+			}
+			continue
+		}
+
+		// Limit the read, if necessary.
+		end := len(buf.data)
+		if int64(end-buf.write) > (count - done) {
+			end = buf.write + int(count-done)
+		}
+
+		// Pass the relevant portion of the buffer.
+		n, err = r.Read(buf.data[buf.write:end])
+		buf.write += n
+		done += int64(n)
+		v.size += int64(n)
+		if err == io.EOF {
+			err = nil // Short write allowed.
+			break
+		} else if err != nil {
+			break
+		}
+	}
+	return done, err
+}
+
+// ReadToWriter reads from the buffer into an io.Writer.
+//
+// N.B. This does not consume the bytes read. TrimFront should
+// be called appropriately after this call in order to do so.
+func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
+	var (
+		done int64
+		n    int
+		err  error
+	)
+	offset := 0 // Spill-over for batching.
+	for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() {
+		l := buf.write - buf.read - offset
+
+		// Is this less than the minimum batch?
+		if l < minBatch && (count-done) >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
+			tmp := make([]byte, minBatch)
+			n, err = v.ReadAt(tmp, done)
+			w.Write(tmp[:n])
+			done += int64(n)
+			offset = n - l // Reset below.
+			if err != nil {
+				break
+			}
+			continue
+		}
+
+		// Limit the write if necessary.
+		if int64(l) >= (count - done) {
+			l = int(count - done)
+		}
+
+		// Perform the actual write.
+		n, err = w.Write(buf.data[buf.read+offset : buf.read+offset+l])
+		done += int64(n)
+		if err != nil {
+			break
+		}
+
+		// Reset spill-over.
+		offset = 0
+	}
+	return done, err
+}
diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go
new file mode 100644
index 000000000..37e652f16
--- /dev/null
+++ b/pkg/buffer/view_test.go
@@ -0,0 +1,233 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+)
+
+func TestView(t *testing.T) {
+	testCases := []struct {
+		name   string
+		input  string
+		output string
+		ops    []func(*View)
+	}{
+		// Prepend.
+		{
+			name:  "prepend",
+			input: "world",
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte("hello "))
+				},
+			},
+			output: "hello world",
+		},
+		{
+			name:  "prepend fill",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte("0"))
+				},
+			},
+			output: "0" + strings.Repeat("1", bufferSize-1),
+		},
+		{
+			name:  "prepend overflow",
+			input: strings.Repeat("1", bufferSize),
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte("0"))
+				},
+			},
+			output: "0" + strings.Repeat("1", bufferSize),
+		},
+		{
+			name:  "prepend multiple buffers",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Prepend([]byte(strings.Repeat("0", bufferSize*3)))
+				},
+			},
+			output: strings.Repeat("0", bufferSize*3) + strings.Repeat("1", bufferSize-1),
+		},
+
+		// Append.
+		{
+			name:  "append",
+			input: "hello",
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte(" world"))
+				},
+			},
+			output: "hello world",
+		},
+		{
+			name:  "append fill",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte("0"))
+				},
+			},
+			output: strings.Repeat("1", bufferSize-1) + "0",
+		},
+		{
+			name:  "append overflow",
+			input: strings.Repeat("1", bufferSize),
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte("0"))
+				},
+			},
+			output: strings.Repeat("1", bufferSize) + "0",
+		},
+		{
+			name:  "append multiple buffers",
+			input: strings.Repeat("1", bufferSize-1),
+			ops: []func(*View){
+				func(v *View) {
+					v.Append([]byte(strings.Repeat("0", bufferSize*3)))
+				},
+			},
+			output: strings.Repeat("1", bufferSize-1) + strings.Repeat("0", bufferSize*3),
+		},
+
+		// Truncate.
+		{
+			name:  "truncate",
+			input: "hello world",
+			ops: []func(*View){
+				func(v *View) {
+					v.Truncate(5)
+				},
+			},
+			output: "hello",
+		},
+		{
+			name:  "truncate multiple buffers",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.Truncate(bufferSize*2 - 1)
+				},
+			},
+			output: strings.Repeat("1", bufferSize*2-1),
+		},
+		{
+			name:  "truncate multiple buffers to one buffer",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.Truncate(5)
+				},
+			},
+			output: "11111",
+		},
+
+		// TrimFront.
+		{
+			name:  "trim",
+			input: "hello world",
+			ops: []func(*View){
+				func(v *View) {
+					v.TrimFront(6)
+				},
+			},
+			output: "world",
+		},
+		{
+			name:  "trim multiple buffers",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.TrimFront(1)
+				},
+			},
+			output: strings.Repeat("1", bufferSize*2-1),
+		},
+		{
+			name:  "trim multiple buffers to one buffer",
+			input: strings.Repeat("1", bufferSize*2),
+			ops: []func(*View){
+				func(v *View) {
+					v.TrimFront(bufferSize*2 - 1)
+				},
+			},
+			output: "1",
+		},
+
+		// Grow.
+		{
+			name:  "grow",
+			input: "hello world",
+			ops: []func(*View){
+				func(v *View) {
+					v.Grow(1, true)
+				},
+			},
+			output: "hello world",
+		},
+		{
+			name: "grow from zero",
+			ops: []func(*View){
+				func(v *View) {
+					v.Grow(1024, true)
+				},
+			},
+			output: strings.Repeat("\x00", 1024),
+		},
+		{
+			name:  "grow from non-zero",
+			input: strings.Repeat("1", bufferSize),
+			ops: []func(*View){
+				func(v *View) {
+					v.Grow(bufferSize*2, true)
+				},
+			},
+			output: strings.Repeat("1", bufferSize) + strings.Repeat("\x00", bufferSize),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Construct the new view.
+			var view View
+			view.Append([]byte(tc.input))
+
+			// Run all operations.
+			for _, op := range tc.ops {
+				op(&view)
+			}
+
+			// Flatten and validate.
+			out := view.Flatten()
+			if !bytes.Equal([]byte(tc.output), out) {
+				t.Errorf("expected %q, got %q", tc.output, string(out))
+			}
+
+			// Ensure the size is correct.
+			if len(out) != int(view.Size()) {
+				t.Errorf("size is wrong: expected %d, got %d", len(out), view.Size())
+			}
+		})
+	}
+}
diff --git a/pkg/buffer/view_unsafe.go b/pkg/buffer/view_unsafe.go
new file mode 100644
index 000000000..d1ef39b26
--- /dev/null
+++ b/pkg/buffer/view_unsafe.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"unsafe"
+)
+
+// minBatch is the smallest Read or Write operation that the
+// WriteFromReader and ReadToWriter functions will use.
+//
+// This is defined as the size of a native pointer.
+const minBatch = int(unsafe.Sizeof(uintptr(0)))
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 4c049d5b4..f29dc0472 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,25 +1,10 @@
 load("//tools:defs.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-go_template_instance(
-    name = "buffer_list",
-    out = "buffer_list.go",
-    package = "pipe",
-    prefix = "buffer",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*buffer",
-        "Linker": "*buffer",
-    },
-)
-
 go_library(
     name = "pipe",
     srcs = [
-        "buffer.go",
-        "buffer_list.go",
         "device.go",
         "node.go",
         "pipe.go",
@@ -33,8 +18,8 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/buffer",
         "//pkg/context",
-        "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
@@ -51,7 +36,6 @@ go_test(
     name = "pipe_test",
     size = "small",
     srcs = [
-        "buffer_test.go",
         "node_test.go",
         "pipe_test.go",
     ],
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
deleted file mode 100644
index fe3be5dbd..000000000
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pipe
-
-import (
-	"io"
-
-	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// buffer encapsulates a queueable byte buffer.
-//
-// Note that the total size is slightly less than two pages. This
-// is done intentionally to ensure that the buffer object aligns
-// with runtime internals. We have no hard size or alignment
-// requirements. This two page size will effectively minimize
-// internal fragmentation, but still have a large enough chunk
-// to limit excessive segmentation.
-//
-// +stateify savable
-type buffer struct {
-	data  [8144]byte
-	read  int
-	write int
-	bufferEntry
-}
-
-// Reset resets internal data.
-//
-// This must be called before use.
-func (b *buffer) Reset() {
-	b.read = 0
-	b.write = 0
-}
-
-// Empty indicates the buffer is empty.
-//
-// This indicates there is no data left to read.
-func (b *buffer) Empty() bool {
-	return b.read == b.write
-}
-
-// Full indicates the buffer is full.
-//
-// This indicates there is no capacity left to write.
-func (b *buffer) Full() bool {
-	return b.write == len(b.data)
-}
-
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
-func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:]))
-	n, err := safemem.CopySeq(dst, srcs)
-	b.write += int(n)
-	return n, err
-}
-
-// WriteFromReader writes to the buffer from an io.Reader.
-func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) {
-	dst := b.data[b.write:]
-	if count < int64(len(dst)) {
-		dst = b.data[b.write:][:count]
-	}
-	n, err := r.Read(dst)
-	b.write += n
-	return int64(n), err
-}
-
-// ReadToBlocks implements safemem.Reader.ReadToBlocks.
-func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write]))
-	n, err := safemem.CopySeq(dsts, src)
-	b.read += int(n)
-	return n, err
-}
-
-// ReadToWriter reads from the buffer into an io.Writer.
-func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) {
-	src := b.data[b.read:b.write]
-	if count < int64(len(src)) {
-		src = b.data[b.read:][:count]
-	}
-	n, err := w.Write(src)
-	if !dup {
-		b.read += n
-	}
-	return int64(n), err
-}
-
-// bufferPool is a pool for buffers.
-var bufferPool = sync.Pool{
-	New: func() interface{} {
-		return new(buffer)
-	},
-}
-
-// newBuffer grabs a new buffer from the pool.
-func newBuffer() *buffer {
-	b := bufferPool.Get().(*buffer)
-	b.Reset()
-	return b
-}
diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/buffer_test.go
deleted file mode 100644
index 4d54b8b8f..000000000
--- a/pkg/sentry/kernel/pipe/buffer_test.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pipe
-
-import (
-	"testing"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-func TestBufferSize(t *testing.T) {
-	bufferSize := unsafe.Sizeof(buffer{})
-	if bufferSize < usermem.PageSize {
-		t.Errorf("buffer is less than a page")
-	}
-	if bufferSize > (2 * usermem.PageSize) {
-		t.Errorf("buffer is greater than two pages")
-	}
-}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 08410283f..725e9db7d 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -20,6 +20,7 @@ import (
 	"sync/atomic"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -70,10 +71,10 @@ type Pipe struct {
 	// mu protects all pipe internal state below.
 	mu sync.Mutex `state:"nosave"`
 
-	// data is the buffer queue of pipe contents.
+	// view is the underlying set of buffers.
 	//
 	// This is protected by mu.
-	data bufferList
+	view buffer.View
 
 	// max is the maximum size of the pipe in bytes. When this max has been
 	// reached, writers will get EWOULDBLOCK.
@@ -81,11 +82,6 @@ type Pipe struct {
 	// This is protected by mu.
 	max int64
 
-	// size is the current size of the pipe in bytes.
-	//
-	// This is protected by mu.
-	size int64
-
 	// hadWriter indicates if this pipe ever had a writer. Note that this
 	// does not necessarily indicate there is *currently* a writer, just
 	// that there has been a writer at some point since the pipe was
@@ -196,7 +192,7 @@ type readOps struct {
 	limit func(int64)
 
 	// read performs the actual read operation.
-	read func(*buffer) (int64, error)
+	read func(*buffer.View) (int64, error)
 }
 
 // read reads data from the pipe into dst and returns the number of bytes
@@ -213,7 +209,7 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	defer p.mu.Unlock()
 
 	// Is the pipe empty?
-	if p.size == 0 {
+	if p.view.Size() == 0 {
 		if !p.HasWriters() {
 			// There are no writers, return EOF.
 			return 0, nil
@@ -222,71 +218,13 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	}
 
 	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
+	if ops.left() > p.view.Size() {
+		ops.limit(p.view.Size())
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Pop the first buffer.
-		first := p.data.Front()
-		if first == nil {
-			break
-		}
-
-		// Copy user data.
-		n, err := ops.read(first)
-		done += int64(n)
-		p.size -= n
-
-		// Empty buffer?
-		if first.Empty() {
-			// Push to the free list.
-			p.data.Remove(first)
-			bufferPool.Put(first)
-		}
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
-}
-
-// dup duplicates all data from this pipe into the given writer.
-//
-// There is no blocking behavior implemented here. The writer may propagate
-// some blocking error. All the writes must be complete writes.
-func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Is the pipe empty?
-	if p.size == 0 {
-		if !p.HasWriters() {
-			// See above.
-			return 0, nil
-		}
-		return 0, syserror.ErrWouldBlock
-	}
-
-	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
-	}
-
-	done := int64(0)
-	for buf := p.data.Front(); buf != nil; buf = buf.Next() {
-		n, err := ops.read(buf)
-		done += n
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
+	// Copy user data; the read op is responsible for trimming.
+	done, err := ops.read(&p.view)
+	return done, err
 }
 
 type writeOps struct {
@@ -297,7 +235,7 @@ type writeOps struct {
 	limit func(int64)
 
 	// write should write to the provided buffer.
-	write func(*buffer) (int64, error)
+	write func(*buffer.View) (int64, error)
 }
 
 // write writes data from sv into the pipe and returns the number of bytes
@@ -317,33 +255,19 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
 	wanted := ops.left()
-	if avail := p.max - p.size; wanted > avail {
+	if avail := p.max - p.view.Size(); wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
 		ops.limit(avail)
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Need a new buffer?
-		last := p.data.Back()
-		if last == nil || last.Full() {
-			// Add a new buffer to the data list.
-			last = newBuffer()
-			p.data.PushBack(last)
-		}
-
-		// Copy user data.
-		n, err := ops.write(last)
-		done += int64(n)
-		p.size += n
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
+	// Copy user data.
+	done, err := ops.write(&p.view)
+	if err != nil {
+		return done, err
 	}
+
 	if wanted > done {
 		// Partial write due to full pipe.
 		return done, syserror.ErrWouldBlock
@@ -396,7 +320,7 @@ func (p *Pipe) HasWriters() bool {
 // Precondition: mu must be held.
 func (p *Pipe) rReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasReaders() && p.data.Front() != nil {
+	if p.HasReaders() && p.view.Size() != 0 {
 		ready |= waiter.EventIn
 	}
 	if !p.HasWriters() && p.hadWriter {
@@ -422,7 +346,7 @@ func (p *Pipe) rReadiness() waiter.EventMask {
 // Precondition: mu must be held.
 func (p *Pipe) wReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasWriters() && p.size < p.max {
+	if p.HasWriters() && p.view.Size() < p.max {
 		ready |= waiter.EventOut
 	}
 	if !p.HasReaders() {
@@ -451,7 +375,7 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
 func (p *Pipe) queued() int64 {
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	return p.size
+	return p.view.Size()
 }
 
 // FifoSize implements fs.FifoSizer.FifoSize.
@@ -474,7 +398,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) {
 	}
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	if size < p.size {
+	if size < p.view.Size() {
 		return 0, syserror.EBUSY
 	}
 	p.max = size
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 80158239e..5a1d4fd57 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -49,9 +50,10 @@ func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			dst = dst.TakeFirst64(l)
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := dst.CopyOutFrom(ctx, buf)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := dst.CopyOutFrom(ctx, view)
 			dst = dst.DropFirst64(n)
+			view.TrimFront(n)
 			return n, err
 		},
 	})
@@ -70,16 +72,15 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool)
 		limit: func(l int64) {
 			count = l
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := buf.ReadToWriter(w, count, dup)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToWriter(w, count)
+			if !dup {
+				view.TrimFront(n)
+			}
 			count -= n
 			return n, err
 		},
 	}
-	if dup {
-		// There is no notification for dup operations.
-		return p.dup(ctx, ops)
-	}
 	n, err := p.read(ctx, ops)
 	if n > 0 {
 		p.Notify(waiter.EventOut)
@@ -96,8 +97,8 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			src = src.TakeFirst64(l)
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := src.CopyInTo(ctx, buf)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := src.CopyInTo(ctx, view)
 			src = src.DropFirst64(n)
 			return n, err
 		},
@@ -117,8 +118,8 @@ func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, e
 		limit: func(l int64) {
 			count = l
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := buf.WriteFromReader(r, count)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromReader(r, count)
 			count -= n
 			return n, err
 		},
-- 
cgit v1.2.3


From 837cf6255120fe4577a9fd758ecbe2f52f268ba8 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Fri, 14 Feb 2020 11:58:07 -0800
Subject: pcids.go isn't arch-specific

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 pkg/sentry/platform/ring0/pagetables/BUILD        |   2 +-
 pkg/sentry/platform/ring0/pagetables/pcids.go     | 107 +++++++++++++++++++++
 pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 109 ----------------------
 3 files changed, 108 insertions(+), 110 deletions(-)
 create mode 100644 pkg/sentry/platform/ring0/pagetables/pcids.go
 delete mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_x86.go

(limited to 'pkg')

diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 4f2406ce3..581841555 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -80,7 +80,7 @@ go_library(
         "pagetables_amd64.go",
         "pagetables_arm64.go",
         "pagetables_x86.go",
-        "pcids_x86.go",
+        "pcids.go",
         "walker_amd64.go",
         "walker_arm64.go",
         "walker_empty.go",
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids.go b/pkg/sentry/platform/ring0/pagetables/pcids.go
new file mode 100644
index 000000000..9206030bf
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids.go
@@ -0,0 +1,107 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// limitPCID is the number of valid PCIDs.
+const limitPCID = 4096
+
+// PCIDs is a simple PCID database.
+//
+// This is not protected by locks and is thus suitable for use only with a
+// single CPU at a time.
+type PCIDs struct {
+	// mu protects below.
+	mu sync.Mutex
+
+	// cache are the assigned page tables.
+	cache map[*PageTables]uint16
+
+	// avail are available PCIDs.
+	avail []uint16
+}
+
+// NewPCIDs returns a new PCID database.
+//
+// start is the first index to assign. Typically this will be one, as the zero
+// pcid will always be flushed on transition (see pagetables_x86.go). This may
+// be more than one if specific PCIDs are reserved.
+//
+// Nil is returned iff the start and size are out of range.
+func NewPCIDs(start, size uint16) *PCIDs {
+	if start+uint16(size) >= limitPCID {
+		return nil // See comment.
+	}
+	p := &PCIDs{
+		cache: make(map[*PageTables]uint16),
+	}
+	for pcid := start; pcid < start+size; pcid++ {
+		p.avail = append(p.avail, pcid)
+	}
+	return p
+}
+
+// Assign assigns a PCID to the given PageTables.
+//
+// This may overwrite any previous assignment provided. If this in the case,
+// true is returned to indicate that the PCID should be flushed.
+func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		p.mu.Unlock()
+		return pcid, false // No flush.
+	}
+
+	// Is there something available?
+	if len(p.avail) > 0 {
+		pcid := p.avail[len(p.avail)-1]
+		p.avail = p.avail[:len(p.avail)-1]
+		p.cache[pt] = pcid
+
+		// We need to flush because while this is in the available
+		// pool, it may have been used previously.
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// Evict an existing table.
+	for old, pcid := range p.cache {
+		delete(p.cache, old)
+		p.cache[pt] = pcid
+
+		// A flush is definitely required in this case, these page
+		// tables may still be active. (They will just be assigned some
+		// other PCID if and when they hit the given CPU again.)
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// No PCID.
+	p.mu.Unlock()
+	return 0, false
+}
+
+// Drop drops references to a set of page tables.
+func (p *PCIDs) Drop(pt *PageTables) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		delete(p.cache, pt)
+		p.avail = append(p.avail, pcid)
+	}
+	p.mu.Unlock()
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
deleted file mode 100644
index e199bae18..000000000
--- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build i386 amd64
-
-package pagetables
-
-import (
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// limitPCID is the number of valid PCIDs.
-const limitPCID = 4096
-
-// PCIDs is a simple PCID database.
-//
-// This is not protected by locks and is thus suitable for use only with a
-// single CPU at a time.
-type PCIDs struct {
-	// mu protects below.
-	mu sync.Mutex
-
-	// cache are the assigned page tables.
-	cache map[*PageTables]uint16
-
-	// avail are available PCIDs.
-	avail []uint16
-}
-
-// NewPCIDs returns a new PCID database.
-//
-// start is the first index to assign. Typically this will be one, as the zero
-// pcid will always be flushed on transition (see pagetables_x86.go). This may
-// be more than one if specific PCIDs are reserved.
-//
-// Nil is returned iff the start and size are out of range.
-func NewPCIDs(start, size uint16) *PCIDs {
-	if start+uint16(size) >= limitPCID {
-		return nil // See comment.
-	}
-	p := &PCIDs{
-		cache: make(map[*PageTables]uint16),
-	}
-	for pcid := start; pcid < start+size; pcid++ {
-		p.avail = append(p.avail, pcid)
-	}
-	return p
-}
-
-// Assign assigns a PCID to the given PageTables.
-//
-// This may overwrite any previous assignment provided. If this in the case,
-// true is returned to indicate that the PCID should be flushed.
-func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
-	p.mu.Lock()
-	if pcid, ok := p.cache[pt]; ok {
-		p.mu.Unlock()
-		return pcid, false // No flush.
-	}
-
-	// Is there something available?
-	if len(p.avail) > 0 {
-		pcid := p.avail[len(p.avail)-1]
-		p.avail = p.avail[:len(p.avail)-1]
-		p.cache[pt] = pcid
-
-		// We need to flush because while this is in the available
-		// pool, it may have been used previously.
-		p.mu.Unlock()
-		return pcid, true
-	}
-
-	// Evict an existing table.
-	for old, pcid := range p.cache {
-		delete(p.cache, old)
-		p.cache[pt] = pcid
-
-		// A flush is definitely required in this case, these page
-		// tables may still be active. (They will just be assigned some
-		// other PCID if and when they hit the given CPU again.)
-		p.mu.Unlock()
-		return pcid, true
-	}
-
-	// No PCID.
-	p.mu.Unlock()
-	return 0, false
-}
-
-// Drop drops references to a set of page tables.
-func (p *PCIDs) Drop(pt *PageTables) {
-	p.mu.Lock()
-	if pcid, ok := p.cache[pt]; ok {
-		delete(p.cache, pt)
-		p.avail = append(p.avail, pcid)
-	}
-	p.mu.Unlock()
-}
-- 
cgit v1.2.3


From 413a9b7fdc14f8bff660e1988e3ef0355dd4e6c6 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Fri, 21 Feb 2020 11:00:11 -0800
Subject: Define CPUIDInstruction for arm64

There is no cpuid instruction on arm64, so we need to defined it
just to avoid a compile time error.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 pkg/sentry/arch/arch_arm64.go |  5 +++++
 pkg/sentry/kernel/task_run.go | 41 ++++++++++++++++++++++++++++-------------
 2 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 372b650b9..885115ae2 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -53,6 +53,11 @@ const (
 	preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5
 )
 
+var (
+	// CPUIDInstruction doesn't exist on ARM64.
+	CPUIDInstruction = []byte{}
+)
+
 // These constants are selected as heuristics to help make the Platform's
 // potentially limited address space conform as closely to Linux as possible.
 const (
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 5568c91bc..799cbcd93 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -126,13 +126,39 @@ func (t *Task) doStop() {
 	}
 }
 
+func (*runApp) handleCPUIDInstruction(t *Task) error {
+	if len(arch.CPUIDInstruction) == 0 {
+		// CPUID emulation isn't supported, but this code can be
+		// executed, because the ptrace platform returns
+		// ErrContextSignalCPUID on page faults too. Look at
+		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more
+		// details.
+		return platform.ErrContextSignal
+	}
+	// Is this a CPUID instruction?
+	region := trace.StartRegion(t.traceContext, cpuidRegion)
+	expected := arch.CPUIDInstruction[:]
+	found := make([]byte, len(expected))
+	_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+	if err == nil && bytes.Equal(expected, found) {
+		// Skip the cpuid instruction.
+		t.Arch().CPUIDEmulate(t)
+		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+		region.End()
+
+		return nil
+	}
+	region.End() // Not an actual CPUID, but required copy-in.
+	return platform.ErrContextSignal
+}
+
 // The runApp state checks for interrupts before executing untrusted
 // application code.
 //
 // +stateify savable
 type runApp struct{}
 
-func (*runApp) execute(t *Task) taskRunState {
+func (app *runApp) execute(t *Task) taskRunState {
 	if t.interrupted() {
 		// Checkpointing instructs tasks to stop by sending an interrupt, so we
 		// must check for stops before entering runInterrupt (instead of
@@ -237,21 +263,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextSignalCPUID:
-		// Is this a CPUID instruction?
-		region := trace.StartRegion(t.traceContext, cpuidRegion)
-		expected := arch.CPUIDInstruction[:]
-		found := make([]byte, len(expected))
-		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
-		if err == nil && bytes.Equal(expected, found) {
-			// Skip the cpuid instruction.
-			t.Arch().CPUIDEmulate(t)
-			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
-			region.End()
-
+		if err := app.handleCPUIDInstruction(t); err == nil {
 			// Resume execution.
 			return (*runApp)(nil)
 		}
-		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
-- 
cgit v1.2.3


From ab7ecdd66d2aa4835bbe655ba672bd30813a2a88 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 28 Feb 2020 17:53:25 -0800
Subject: watchdog: print panic error message before other messages

This is needed for syzkaller to proper classify issues.

Right now, all watchdog issues are duped to one with the
subject "panic: Sentry detected stuck task(s). See stack
trace and message above for more details".

PiperOrigin-RevId: 297975363
---
 pkg/sentry/watchdog/watchdog.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index bfb2fac26..f7d6009a0 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -221,7 +221,7 @@ func (w *Watchdog) waitForStart() {
 		return
 	}
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog.Start() not called within %s:\n")
+	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
 	w.doAction(w.StartupTimeoutAction, false, &buf)
 }
 
@@ -325,7 +325,7 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 
 func (w *Watchdog) reportStuckWatchdog() {
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog goroutine is stuck:\n")
+	buf.WriteString("Watchdog goroutine is stuck:")
 	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
@@ -359,7 +359,7 @@ func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) {
 		case <-metricsEmitted:
 		case <-time.After(1 * time.Second):
 		}
-		panic(fmt.Sprintf("Stack for running G's are skipped while panicking.\n%s", msg.String()))
+		panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String()))
 	default:
 		panic(fmt.Sprintf("Unknown watchdog action %v", action))
 
-- 
cgit v1.2.3


From 62bd3ca8a37543c060292ca86669e537b9ebf36d Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 2 Mar 2020 10:05:52 -0800
Subject: Take write lock when removing xattr

PiperOrigin-RevId: 298380654
---
 pkg/sentry/fs/fsutil/inode.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index daecc4ffe..1922ff08c 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -259,8 +259,8 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, ui
 
 // RemoveXattr implements fs.InodeOperations.RemoveXattr.
 func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
-	i.mu.RLock()
-	defer i.mu.RUnlock()
+	i.mu.Lock()
+	defer i.mu.Unlock()
 	if _, ok := i.xattrs[name]; ok {
 		delete(i.xattrs, name)
 		return nil
-- 
cgit v1.2.3


From 42fb7d349137bd8847e7c3df6493fde3bc8e6e89 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 2 Mar 2020 10:32:20 -0800
Subject: socket: take readMu to access readView

DATA RACE in netstack.(*SocketOperations).fetchReadView

Write at 0x00c001dca138 by goroutine 1001:
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).fetchReadView()
      pkg/sentry/socket/netstack/netstack.go:418 +0x85
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).coalescingRead()
      pkg/sentry/socket/netstack/netstack.go:2309 +0x67
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).nonBlockingRead()
      pkg/sentry/socket/netstack/netstack.go:2378 +0x183d

Previous read at 0x00c001dca138 by goroutine 1111:
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).Ioctl()
      pkg/sentry/socket/netstack/netstack.go:2666 +0x533
  gvisor.dev/gvisor/pkg/sentry/syscalls/linux.Ioctl()

Reported-by: syzbot+d4c3885fcc346f08deb6@syzkaller.appspotmail.com
PiperOrigin-RevId: 298387377
---
 pkg/sentry/socket/netstack/netstack.go | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 48c268bfa..1eeb37446 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2663,7 +2663,9 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 		}
 
 		// Add bytes removed from the endpoint but not yet sent to the caller.
+		s.readMu.Lock()
 		v += len(s.readView)
+		s.readMu.Unlock()
 
 		if v > math.MaxInt32 {
 			v = math.MaxInt32
-- 
cgit v1.2.3


From 8821a7104f8c8f263b88def1a646d518ec3f5dd2 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 2 Mar 2020 13:14:12 -0800
Subject: Do not read-lock NIC recursively

A deadlock may occur if a write lock on a RWMutex is blocked between
nested read lock attempts as the inner read lock attempt will be
blocked in this scenario.

Example (T1 and T2 are differnt goroutines):
  T1: obtain read-lock
  T2: attempt write-lock (blocks)
  T1: attempt inner/nested read-lock (blocks)

Here we can see that T1 and T2 are deadlocked.

Tests: Existing tests pass.
PiperOrigin-RevId: 298426678
---
 pkg/tcpip/stack/nic.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 46d3a6646..3e6196aee 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -451,7 +451,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 	cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs))
 	for _, r := range primaryAddrs {
 		// If r is not valid for outgoing connections, it is not a valid endpoint.
-		if !r.isValidForOutgoing() {
+		if !r.isValidForOutgoingRLocked() {
 			continue
 		}
 
-- 
cgit v1.2.3


From 33101752501fafea99d77f34bbd65f3e0083d22e Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 2 Mar 2020 14:43:52 -0800
Subject: Fix data-race when reading/writing e.amss.

PiperOrigin-RevId: 298451319
---
 pkg/tcpip/transport/tcp/connect.go  | 11 +++++++++--
 pkg/tcpip/transport/tcp/endpoint.go | 29 ++++++++++++++++++-----------
 test/syscalls/linux/tcp_socket.cc   | 15 +++++++++++++++
 3 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index cd247f3e1..ae4f3f3a9 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -295,6 +295,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	h.state = handshakeSynRcvd
 	h.ep.mu.Lock()
 	ttl := h.ep.ttl
+	amss := h.ep.amss
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
@@ -307,7 +308,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 		// permits SACK. This is not explicitly defined in the RFC but
 		// this is the behaviour implemented by Linux.
 		SACKPermitted: rcvSynOpts.SACKPermitted,
-		MSS:           h.ep.amss,
+		MSS:           amss,
 	}
 	if ttl == 0 {
 		ttl = s.route.DefaultTTL()
@@ -356,6 +357,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
+		h.ep.mu.RLock()
+		amss := h.ep.amss
+		h.ep.mu.RUnlock()
+
 		h.resetState()
 		synOpts := header.TCPSynOptions{
 			WS:            h.rcvWndScale,
@@ -363,7 +368,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			TSVal:         h.ep.timestamp(),
 			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
-			MSS:           h.ep.amss,
+			MSS:           amss,
 		}
 		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
 		return nil
@@ -530,6 +535,7 @@ func (h *handshake) execute() *tcpip.Error {
 
 	// Send the initial SYN segment and loop until the handshake is
 	// completed.
+	h.ep.mu.Lock()
 	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 
 	synOpts := header.TCPSynOptions{
@@ -540,6 +546,7 @@ func (h *handshake) execute() *tcpip.Error {
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
+	h.ep.mu.Unlock()
 
 	// Execute is also called in a listen context so we want to make sure we
 	// only send the TS/SACK option when we received the TS/SACK in the
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9e72730bd..8b9154e69 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -959,15 +959,18 @@ func (e *endpoint) initialReceiveWindow() int {
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
 // based on the number of bytes copied to user space.
 func (e *endpoint) ModerateRecvBuf(copied int) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if e.rcvAutoParams.disabled {
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	now := time.Now()
 	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
 		e.rcvAutoParams.copied += copied
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	prevRTTCopied := e.rcvAutoParams.copied + copied
@@ -1008,7 +1011,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 			e.rcvBufSize = rcvWnd
 			availAfter := e.receiveBufferAvailableLocked()
 			mask := uint32(notifyReceiveWindowChanged)
-			if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 				mask |= notifyNonZeroReceiveWindow
 			}
 			e.notifyProtocolGoroutine(mask)
@@ -1023,6 +1026,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvAutoParams.measureTime = now
 	e.rcvAutoParams.copied = 0
 	e.rcvListMu.Unlock()
+	e.mu.RUnlock()
 }
 
 // IPTables implements tcpip.Endpoint.IPTables.
@@ -1052,7 +1056,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 	v, err := e.readLocked()
 	e.rcvListMu.Unlock()
-
 	e.mu.RUnlock()
 
 	if err == tcpip.ErrClosedForReceive {
@@ -1085,7 +1088,7 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	// enough buffer space, to either fit an aMSS or half a receive buffer
 	// (whichever smaller), then notify the protocol goroutine to send a
 	// window update.
-	if crossed, above := e.windowCrossedACKThreshold(len(v)); crossed && above {
+	if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
 
@@ -1303,9 +1306,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
-// windowCrossedACKThreshold checks if the receive window to be announced now
-// would be under aMSS or under half receive buffer, whichever smaller. This is
-// useful as a receive side silly window syndrome prevention mechanism. If
+// windowCrossedACKThresholdLocked checks if the receive window to be announced
+// now would be under aMSS or under half receive buffer, whichever smaller. This
+// is useful as a receive side silly window syndrome prevention mechanism. If
 // window grows to reasonable value, we should send ACK to the sender to inform
 // the rx space is now large. We also want ensure a series of small read()'s
 // won't trigger a flood of spurious tiny ACK's.
@@ -1316,7 +1319,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 // crossed will be true if the window size crossed the ACK threshold.
 // above will be true if the new window is >= ACK threshold and false
 // otherwise.
-func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) (crossed bool, above bool) {
+//
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
 	newAvail := e.receiveBufferAvailableLocked()
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
@@ -1379,6 +1384,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 		mask := uint32(notifyReceiveWindowChanged)
 
+		e.mu.RLock()
 		e.rcvListMu.Lock()
 
 		// Make sure the receive buffer size allows us to send a
@@ -1405,11 +1411,11 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// Immediately send an ACK to uncork the sender silly window
 		// syndrome prevetion, when our available space grows above aMSS
 		// or half receive buffer, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 			mask |= notifyNonZeroReceiveWindow
 		}
 		e.rcvListMu.Unlock()
-
+		e.mu.RUnlock()
 		e.notifyProtocolGoroutine(mask)
 		return nil
 
@@ -2414,13 +2420,14 @@ func (e *endpoint) updateSndBufferUsage(v int) {
 // to be read, or when the connection is closed for receiving (in which case
 // s will be nil).
 func (e *endpoint) readyToRead(s *segment) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if s != nil {
 		s.incRef()
 		e.rcvBufUsed += s.data.Size()
 		// Increase counter if the receive window falls down below MSS
 		// or half receive buffer size, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(-s.data.Size()); crossed && !above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 		}
 		e.rcvList.PushBack(s)
@@ -2428,7 +2435,7 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvClosed = true
 	}
 	e.rcvListMu.Unlock()
-
+	e.mu.RUnlock()
 	e.waiterQueue.Notify(waiter.EventIn)
 }
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index c4591a3b9..579463384 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1349,6 +1349,21 @@ TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
               SyscallFailsWithErrno(ENOTCONN));
 }
 
+TEST_P(SimpleTcpSocketTest, TCPConnectSoRcvBufRace) {
+  auto s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  RetryEINTR(connect)(s.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      addrlen);
+  int buf_sz = 1 << 18;
+  EXPECT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+      SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 43abb24657e737dee1108ff0d512b2e1b6d8a3f6 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Mon, 2 Mar 2020 16:30:51 -0800
Subject: Fix panic caused by invalid address for Bind in packet sockets.

PiperOrigin-RevId: 298476533
---
 pkg/sentry/socket/netstack/netstack.go |  4 ++++
 test/syscalls/linux/packet_socket.cc   | 13 +++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 1eeb37446..13a9a60b4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,6 +712,10 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
+	}
+
 	family := usermem.ByteOrder.Uint16(sockaddr)
 	var addr tcpip.FullAddress
 
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index bc22de788..248762ca9 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -417,6 +417,19 @@ TEST_P(CookedPacketTest, BindDrop) {
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
 }
 
+// Bind with invalid address.
+TEST_P(CookedPacketTest, BindFail) {
+  // Null address.
+  ASSERT_THAT(bind(socket_, nullptr, sizeof(struct sockaddr)),
+              SyscallFailsWithErrno(EFAULT));
+
+  // Address of size 1.
+  uint8_t addr = 0;
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From c15b8515eb4a07699e5f2401f0332286f0a51043 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 3 Mar 2020 13:40:59 -0800
Subject: Fix datarace on TransportEndpointInfo.ID and clean up semantics.

Ensures that all access to TransportEndpointInfo.ID is either:
* In a function ending in a Locked suffix.
* While holding the appropriate mutex.

This primary affects the checkV4Mapped method on affected endpoints, which has
been renamed to checkV4MappedLocked. Also document the method and change its
argument to be a value instead of a pointer which had caused some awkwardness.

This race was possible in the udp and icmp endpoints between Connect and uses
of TransportEndpointInfo.ID including in both itself and Bind.

The tcp endpoint did not suffer from this bug, but benefited from better
documentation.

Updates #357

PiperOrigin-RevId: 298682913
---
 pkg/tcpip/stack/stack.go                  | 12 +++++++-----
 pkg/tcpip/transport/icmp/endpoint.go      | 23 +++++++++++------------
 pkg/tcpip/transport/tcp/endpoint.go       | 15 ++++++++-------
 pkg/tcpip/transport/udp/endpoint.go       | 30 ++++++++++++++++--------------
 pkg/tcpip/transport/udp/endpoint_state.go |  3 +++
 5 files changed, 45 insertions(+), 38 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index ebb6c5e3b..13354d884 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -551,11 +551,13 @@ type TransportEndpointInfo struct {
 	RegisterNICID tcpip.NICID
 }
 
-// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address
-// and returns the network protocol number to be used to communicate with the
-// specified address. It returns an error if the passed address is incompatible
-// with the receiver.
-func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+// AddrNetProtoLocked unwraps the specified address if it is a V4-mapped V6
+// address and returns the network protocol number to be used to communicate
+// with the specified address. It returns an error if the passed address is
+// incompatible with the receiver.
+//
+// Preconditon: the parent endpoint mu must be held while calling this method.
+func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
 	netProto := e.NetProto
 	switch len(addr.Addr) {
 	case header.IPv4AddressSize:
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 426da1ee6..2a396e9bc 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -291,15 +291,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			nicID = e.BindNICID
 		}
 
-		toCopy := *to
-		to = &toCopy
-		netProto, err := e.checkV4Mapped(to)
+		dst, netProto, err := e.checkV4MappedLocked(*to)
 		if err != nil {
 			return 0, nil, err
 		}
 
-		// Find the enpoint.
-		r, err := e.stack.FindRoute(nicID, e.BindAddr, to.Addr, netProto, false /* multicastLoop */)
+		// Find the endpoint.
+		r, err := e.stack.FindRoute(nicID, e.BindAddr, dst.Addr, netProto, false /* multicastLoop */)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -480,13 +478,14 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	})
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, false /* v6only */)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -517,7 +516,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -630,7 +629,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8b9154e69..40cc664c0 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1874,13 +1874,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	}
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -1910,7 +1911,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 
 	connectingAddr := addr.Addr
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -2276,7 +2277,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	}
 
 	e.BindAddr = addr.Addr
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 1c6a600b8..0af4514e1 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -443,19 +443,19 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return 0, nil, tcpip.ErrBroadcastDisabled
 		}
 
-		netProto, err := e.checkV4Mapped(to)
+		dst, netProto, err := e.checkV4MappedLocked(*to)
 		if err != nil {
 			return 0, nil, err
 		}
 
-		r, _, err := e.connectRoute(nicID, *to, netProto)
+		r, _, err := e.connectRoute(nicID, dst, netProto)
 		if err != nil {
 			return 0, nil, err
 		}
 		defer r.Release()
 
 		route = &r
-		dstPort = to.Port
+		dstPort = dst.Port
 	}
 
 	if route.IsResolutionRequired() {
@@ -566,7 +566,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
-		netProto, err := e.checkV4Mapped(&fa)
+		fa, netProto, err := e.checkV4MappedLocked(fa)
 		if err != nil {
 			return err
 		}
@@ -927,13 +927,14 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	return nil
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -981,10 +982,6 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 
 // Connect connects the endpoint to its peer. Specifying a NIC is optional.
 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
-	netProto, err := e.checkV4Mapped(&addr)
-	if err != nil {
-		return err
-	}
 	if addr.Port == 0 {
 		// We don't support connecting to port zero.
 		return tcpip.ErrInvalidEndpointState
@@ -1012,6 +1009,11 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
 	r, nicID, err := e.connectRoute(nicID, addr, netProto)
 	if err != nil {
 		return err
@@ -1139,7 +1141,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 43fb047ed..466bd9381 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -69,6 +69,9 @@ func (e *endpoint) afterLoad() {
 
 // Resume implements tcpip.ResumableEndpoint.Resume.
 func (e *endpoint) Resume(s *stack.Stack) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
 	e.stack = s
 
 	for _, m := range e.multicastMemberships {
-- 
cgit v1.2.3


From 844e4d284cddf9795a0db7c38f926fe7b49bb873 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Tue, 3 Mar 2020 14:09:45 -0800
Subject: Extract local variables for readability

PiperOrigin-RevId: 298690552
---
 pkg/ilist/list.go | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'pkg')

diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go
index 019caadca..f3a609b57 100644
--- a/pkg/ilist/list.go
+++ b/pkg/ilist/list.go
@@ -88,8 +88,9 @@ func (l *List) Back() Element {
 
 // PushFront inserts the element e at the front of list l.
 func (l *List) PushFront(e Element) {
-	ElementMapper{}.linkerFor(e).SetNext(l.head)
-	ElementMapper{}.linkerFor(e).SetPrev(nil)
+	linker := ElementMapper{}.linkerFor(e)
+	linker.SetNext(l.head)
+	linker.SetPrev(nil)
 
 	if l.head != nil {
 		ElementMapper{}.linkerFor(l.head).SetPrev(e)
@@ -102,8 +103,9 @@ func (l *List) PushFront(e Element) {
 
 // PushBack inserts the element e at the back of list l.
 func (l *List) PushBack(e Element) {
-	ElementMapper{}.linkerFor(e).SetNext(nil)
-	ElementMapper{}.linkerFor(e).SetPrev(l.tail)
+	linker := ElementMapper{}.linkerFor(e)
+	linker.SetNext(nil)
+	linker.SetPrev(l.tail)
 
 	if l.tail != nil {
 		ElementMapper{}.linkerFor(l.tail).SetNext(e)
@@ -132,10 +134,14 @@ func (l *List) PushBackList(m *List) {
 
 // InsertAfter inserts e after b.
 func (l *List) InsertAfter(b, e Element) {
-	a := ElementMapper{}.linkerFor(b).Next()
-	ElementMapper{}.linkerFor(e).SetNext(a)
-	ElementMapper{}.linkerFor(e).SetPrev(b)
-	ElementMapper{}.linkerFor(b).SetNext(e)
+	bLinker := ElementMapper{}.linkerFor(b)
+	eLinker := ElementMapper{}.linkerFor(e)
+
+	a := bLinker.Next()
+
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	bLinker.SetNext(e)
 
 	if a != nil {
 		ElementMapper{}.linkerFor(a).SetPrev(e)
@@ -146,10 +152,13 @@ func (l *List) InsertAfter(b, e Element) {
 
 // InsertBefore inserts e before a.
 func (l *List) InsertBefore(a, e Element) {
-	b := ElementMapper{}.linkerFor(a).Prev()
-	ElementMapper{}.linkerFor(e).SetNext(a)
-	ElementMapper{}.linkerFor(e).SetPrev(b)
-	ElementMapper{}.linkerFor(a).SetPrev(e)
+	aLinker := ElementMapper{}.linkerFor(a)
+	eLinker := ElementMapper{}.linkerFor(e)
+
+	b := aLinker.Prev()
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	aLinker.SetPrev(e)
 
 	if b != nil {
 		ElementMapper{}.linkerFor(b).SetNext(e)
-- 
cgit v1.2.3


From 277a0d5a1fbe8272d4729c01ee4c6e374d047ebc Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 3 Mar 2020 14:34:36 -0800
Subject: platform/ptrace: don't call probeSeccomp on arm64

The support of PTRACE_SYSEMU on arm64 was added in the 5.3 kernel,
so we can be sure that the current version is higher that 5.3.

And this change moves vsyscall seccomp rules to the arch specific file,
because vsyscall isn't supported on arm64.

PiperOrigin-RevId: 298696493
---
 pkg/sentry/platform/ptrace/subprocess_amd64.go | 80 +++++++++++++++++++++++---
 pkg/sentry/platform/ptrace/subprocess_arm64.go | 11 +++-
 pkg/sentry/platform/ptrace/subprocess_linux.go | 65 +--------------------
 3 files changed, 84 insertions(+), 72 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index e99798c56..cd74945e7 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -21,6 +21,7 @@ import (
 	"strings"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -183,13 +184,76 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
-	return append(rules, seccomp.RuleSet{
-		Rules: seccomp.SyscallRules{
-			syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-				{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
+	rules = append(rules,
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
 			},
-		},
-		Action: linux.SECCOMP_RET_ALLOW,
-	})
+			Action:   linux.SECCOMP_RET_TRAP,
+			Vsyscall: true,
+		})
+	if defaultAction != linux.SECCOMP_RET_ALLOW {
+		rules = append(rules,
+			seccomp.RuleSet{
+				Rules: seccomp.SyscallRules{
+					syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+						{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+					},
+				},
+				Action: linux.SECCOMP_RET_ALLOW,
+			})
+	}
+	return rules
+}
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is dynamic
+// because kernels have be backported behavior.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation.
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
 }
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index 7b975137f..7f5c393f0 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -160,6 +160,15 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
 	return rules
 }
+
+// probeSeccomp returns true if seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8.
+//
+// On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so
+// probeSeccomp can always return true.
+func probeSeccomp() bool {
+	return true
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 74968dfdf..2ce528601 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"syscall"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
@@ -30,54 +29,6 @@ import (
 
 const syscallEvent syscall.Signal = 0x80
 
-// probeSeccomp returns true iff seccomp is run after ptrace notifications,
-// which is generally the case for kernel version >= 4.8. This check is dynamic
-// because kernels have be backported behavior.
-//
-// See createStub for more information.
-//
-// Precondition: the runtime OS thread must be locked.
-func probeSeccomp() bool {
-	// Create a completely new, destroyable process.
-	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
-	if err != nil {
-		panic(fmt.Sprintf("seccomp probe failed: %v", err))
-	}
-	defer t.destroy()
-
-	// Set registers to the yield system call. This call is not allowed
-	// by the filters specified in the attachThread function.
-	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
-	if err := t.setRegs(&regs); err != nil {
-		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
-	}
-
-	for {
-		// Attempt an emulation.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
-			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
-		}
-
-		sig := t.wait(stopped)
-		if sig == (syscallEvent | syscall.SIGTRAP) {
-			// Did the seccomp errno hook already run? This would
-			// indicate that seccomp is first in line and we're
-			// less than 4.8.
-			if err := t.getRegs(&regs); err != nil {
-				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
-			}
-			if _, err := syscallReturnValue(&regs); err == nil {
-				// The seccomp errno mode ran first, and reset
-				// the error in the registers.
-				return false
-			}
-			// The seccomp hook did not run yet, and therefore it
-			// is safe to use RET_KILL mode for dispatched calls.
-			return true
-		}
-	}
-}
-
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
@@ -123,18 +74,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 	// stub and all its children. This is used to create child stubs
 	// (below), so we must include the ability to fork, but otherwise lock
 	// down available calls only to what is needed.
-	rules := []seccomp.RuleSet{
-		// Rules for trapping vsyscall access.
-		{
-			Rules: seccomp.SyscallRules{
-				syscall.SYS_GETTIMEOFDAY: {},
-				syscall.SYS_TIME:         {},
-				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
-			},
-			Action:   linux.SECCOMP_RET_TRAP,
-			Vsyscall: true,
-		},
-	}
+	rules := []seccomp.RuleSet{}
 	if defaultAction != linux.SECCOMP_RET_ALLOW {
 		rules = append(rules, seccomp.RuleSet{
 			Rules: seccomp.SyscallRules{
@@ -173,9 +113,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			},
 			Action: linux.SECCOMP_RET_ALLOW,
 		})
-
-		rules = appendArchSeccompRules(rules)
 	}
+	rules = appendArchSeccompRules(rules, defaultAction)
 	instrs, err := seccomp.BuildProgram(rules, defaultAction)
 	if err != nil {
 		return nil, err
-- 
cgit v1.2.3


From 371abe00f052dec37106f2dc22921bc84fb94818 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Tue, 3 Mar 2020 15:06:07 -0800
Subject: Avoid memory leaks

Properly discard segments from the segment heap.

PiperOrigin-RevId: 298704074
---
 pkg/tcpip/transport/tcp/rcv.go          | 4 ++++
 pkg/tcpip/transport/tcp/segment_heap.go | 1 +
 2 files changed, 5 insertions(+)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 958f03ac1..d80aff1b6 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -195,6 +195,10 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 		for i := first; i < len(r.pendingRcvdSegments); i++ {
 			r.pendingRcvdSegments[i].decRef()
+			// Note that slice truncation does not allow garbage collection of
+			// truncated items, thus truncated items must be set to nil to avoid
+			// memory leaks.
+			r.pendingRcvdSegments[i] = nil
 		}
 		r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
 
diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go
index 9fd061d7d..e28f213ba 100644
--- a/pkg/tcpip/transport/tcp/segment_heap.go
+++ b/pkg/tcpip/transport/tcp/segment_heap.go
@@ -41,6 +41,7 @@ func (h *segmentHeap) Pop() interface{} {
 	old := *h
 	n := len(old)
 	x := old[n-1]
+	old[n-1] = nil
 	*h = old[:n-1]
 	return x
 }
-- 
cgit v1.2.3


From 122d47aed17abf4301596e19fc8ac9cdad8118d9 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 3 Mar 2020 15:27:23 -0800
Subject: Update cached file size when cache is skipped

gofer.dentryReadWriter.WriteFromBlocks was not updating
gofer.dentry.size after a write operation that skips the
cache.

Updates #1198

PiperOrigin-RevId: 298708646
---
 pkg/sentry/fsimpl/gofer/regular_file.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 54c1031a7..e95209661 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -361,8 +361,15 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
 	rw.d.handleMu.RLock()
 	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
 		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
-		rw.d.handleMu.RUnlock()
 		rw.off += n
+		rw.d.dataMu.Lock()
+		if rw.off > rw.d.size {
+			atomic.StoreUint64(&rw.d.size, rw.off)
+			// The remote file's size will implicitly be extended to the correct
+			// value when we write back to it.
+		}
+		rw.d.dataMu.Unlock()
+		rw.d.handleMu.RUnlock()
 		return n, err
 	}
 
-- 
cgit v1.2.3


From a690b5762480ae80ef4264402fcfa8d84b57339a Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 4 Mar 2020 14:29:11 -0800
Subject: Ensure that safemem.BlockSeqOf(safemem.Block{}) produces an empty
 BlockSeq.

PiperOrigin-RevId: 298941855
---
 pkg/safemem/seq_test.go   | 21 +++++++++++++++++++++
 pkg/safemem/seq_unsafe.go |  3 +++
 2 files changed, 24 insertions(+)

(limited to 'pkg')

diff --git a/pkg/safemem/seq_test.go b/pkg/safemem/seq_test.go
index eba4bb535..de34005e9 100644
--- a/pkg/safemem/seq_test.go
+++ b/pkg/safemem/seq_test.go
@@ -20,6 +20,27 @@ import (
 	"testing"
 )
 
+func TestBlockSeqOfEmptyBlock(t *testing.T) {
+	bs := BlockSeqOf(Block{})
+	if !bs.IsEmpty() {
+		t.Errorf("BlockSeqOf(Block{}).IsEmpty(): got false, wanted true; BlockSeq is %v", bs)
+	}
+}
+
+func TestBlockSeqOfNonemptyBlock(t *testing.T) {
+	b := BlockFromSafeSlice(make([]byte, 1))
+	bs := BlockSeqOf(b)
+	if bs.IsEmpty() {
+		t.Fatalf("BlockSeqOf(non-empty Block).IsEmpty(): got true, wanted false; BlockSeq is %v", bs)
+	}
+	if head := bs.Head(); head != b {
+		t.Fatalf("BlockSeqOf(non-empty Block).Head(): got %v, wanted %v", head, b)
+	}
+	if tail := bs.Tail(); !tail.IsEmpty() {
+		t.Fatalf("BlockSeqOf(non-empty Block).Tail().IsEmpty(): got false, wanted true: tail is %v", tail)
+	}
+}
+
 type blockSeqTest struct {
 	desc string
 
diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index dcdfc9600..f5f0574f8 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -56,6 +56,9 @@ type BlockSeq struct {
 
 // BlockSeqOf returns a BlockSeq representing the single Block b.
 func BlockSeqOf(b Block) BlockSeq {
+	if b.length == 0 {
+		return BlockSeq{}
+	}
 	bs := BlockSeq{
 		data:   b.start,
 		length: -1,
-- 
cgit v1.2.3


From 80b40bbb06f3c9ca23e3ad152b481ab222ec0e47 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 4 Mar 2020 16:15:22 -0800
Subject: tests: Don't print log messages on stdout

A parser of test results doesn't expect to see any extra messages.

PiperOrigin-RevId: 298966577
---
 pkg/seccomp/seccomp_test.go     | 2 +-
 runsc/container/console_test.go | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'pkg')

diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index da5a5e4b2..88766f33b 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -451,7 +451,7 @@ func TestRandom(t *testing.T) {
 		}
 	}
 
-	fmt.Printf("Testing filters: %v", syscallRules)
+	t.Logf("Testing filters: %v", syscallRules)
 	instrs, err := BuildProgram([]RuleSet{
 		RuleSet{
 			Rules:  syscallRules,
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index c2518d52b..651615d4c 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -333,13 +333,13 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	// file. Writes after a certain point will block unless we drain the
 	// PTY, so we must continually copy from it.
 	//
-	// We log the output to stdout for debugabilitly, and also to a buffer,
+	// We log the output to stderr for debugabilitly, and also to a buffer,
 	// since we wait on particular output from bash below. We use a custom
 	// blockingBuffer which is thread-safe and also blocks on Read calls,
 	// which makes this a suitable Reader for WaitUntilRead.
 	ptyBuf := newBlockingBuffer()
 	tee := io.TeeReader(ptyMaster, ptyBuf)
-	go io.Copy(os.Stdout, tee)
+	go io.Copy(os.Stderr, tee)
 
 	// Start the container.
 	if err := c.Start(conf); err != nil {
-- 
cgit v1.2.3


From 9b3aad33c4470908953b7b548b12cba77799f342 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 5 Mar 2020 15:55:40 -0800
Subject: Use a pool of arrays to avoid slice headers from escaping in TCP
 options pool.

By putting slices into the pool, the slice header escapes. This can be avoided
by not putting the slice header into the pool.

This removes an allocation from the TCP segment send path.

PiperOrigin-RevId: 299215480
---
 pkg/tcpip/transport/tcp/BUILD             |  1 +
 pkg/tcpip/transport/tcp/connect.go        |  6 +++---
 pkg/tcpip/transport/tcp/connect_unsafe.go | 30 ++++++++++++++++++++++++++++++
 3 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/connect_unsafe.go

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 272e8f570..a32f9eacf 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -32,6 +32,7 @@ go_library(
     srcs = [
         "accept.go",
         "connect.go",
+        "connect_unsafe.go",
         "cubic.go",
         "cubic_state.go",
         "dispatcher.go",
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index ae4f3f3a9..c0f73ef16 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -624,17 +624,17 @@ func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
 
 var optionPool = sync.Pool{
 	New: func() interface{} {
-		return make([]byte, maxOptionSize)
+		return &[maxOptionSize]byte{}
 	},
 }
 
 func getOptions() []byte {
-	return optionPool.Get().([]byte)
+	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
 }
 
 func putOptions(options []byte) {
 	// Reslice to full capacity.
-	optionPool.Put(options[0:cap(options)])
+	optionPool.Put(optionsToArray(options))
 }
 
 func makeSynOptions(opts header.TCPSynOptions) []byte {
diff --git a/pkg/tcpip/transport/tcp/connect_unsafe.go b/pkg/tcpip/transport/tcp/connect_unsafe.go
new file mode 100644
index 000000000..cfc304616
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/connect_unsafe.go
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+// optionsToArray converts a slice of capacity >-= maxOptionSize to an array.
+//
+// optionsToArray panics if the capacity of options is smaller than
+// maxOptionSize.
+func optionsToArray(options []byte) *[maxOptionSize]byte {
+	// Reslice to full capacity.
+	options = options[0:maxOptionSize]
+	return (*[maxOptionSize]byte)(unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&options)).Data))
+}
-- 
cgit v1.2.3


From 9b64b658c1b9a4986bc5a4ebd9e5ddeb9f52dfa3 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 5 Mar 2020 17:39:11 -0800
Subject: Fix S/R on inet.Namespace.

PiperOrigin-RevId: 299238067
---
 pkg/sentry/inet/namespace.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go
index c16667e7f..029af3025 100644
--- a/pkg/sentry/inet/namespace.go
+++ b/pkg/sentry/inet/namespace.go
@@ -23,7 +23,10 @@ type Namespace struct {
 
 	// creator allows kernel to create new network stack for network namespaces.
 	// If nil, no networking will function if network is namespaced.
-	creator NetworkStackCreator
+	//
+	// At afterLoad(), creator will be used to create network stack. Stateify
+	// needs to wait for this field to be loaded before calling afterLoad().
+	creator NetworkStackCreator `state:"wait"`
 
 	// isRoot indicates whether this is the root network namespace.
 	isRoot bool
-- 
cgit v1.2.3


From da48fc6cca23a38faef51c5b5f8ae609940773a0 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Thu, 5 Mar 2020 18:21:39 -0800
Subject: Stub oom_score_adj and oom_score.

Adds an oom_score_adj and oom_score proc file stub. oom_score_adj accepts
writes of values -1000 to 1000 and persists the value with the task. New tasks
inherit the parent's oom_score_adj.

oom_score is a read-only stub that always returns the value '0'.

Issue #202

PiperOrigin-RevId: 299245355
---
 pkg/sentry/fs/proc/task.go               | 126 ++++++++++++++++++++++++++-----
 pkg/sentry/fsimpl/proc/task.go           |  12 +--
 pkg/sentry/fsimpl/proc/task_files.go     |  43 +++++++++++
 pkg/sentry/fsimpl/proc/tasks_test.go     |  32 ++++----
 pkg/sentry/kernel/task.go                |  33 ++++++++
 pkg/sentry/kernel/task_clone.go          |   6 ++
 pkg/sentry/kernel/task_start.go          |   4 +
 test/syscalls/BUILD                      |   8 +-
 test/syscalls/linux/BUILD                |  13 ++++
 test/syscalls/linux/proc.cc              |  21 ++++++
 test/syscalls/linux/proc_pid_oomscore.cc |  72 ++++++++++++++++++
 11 files changed, 330 insertions(+), 40 deletions(-)
 create mode 100644 test/syscalls/linux/proc_pid_oomscore.cc

(limited to 'pkg')

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 8ab8d8a02..4e9b0fc00 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -72,24 +72,26 @@ var _ fs.InodeOperations = (*taskDir)(nil)
 // newTaskDir creates a new proc task entry.
 func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":      newAuxvec(t, msrc),
-		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":      newComm(t, msrc),
-		"environ":   newExecArgInode(t, msrc, environExecArg),
-		"exe":       newExe(t, msrc),
-		"fd":        newFdDir(t, msrc),
-		"fdinfo":    newFdInfoDir(t, msrc),
-		"gid_map":   newGIDMap(t, msrc),
-		"io":        newIO(t, msrc, isThreadGroup),
-		"maps":      newMaps(t, msrc),
-		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
-		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		"ns":        newNamespaceDir(t, msrc),
-		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
-		"statm":     newStatm(t, msrc),
-		"status":    newStatus(t, msrc, p.pidns),
-		"uid_map":   newUIDMap(t, msrc),
+		"auxv":          newAuxvec(t, msrc),
+		"cmdline":       newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":          newComm(t, msrc),
+		"environ":       newExecArgInode(t, msrc, environExecArg),
+		"exe":           newExe(t, msrc),
+		"fd":            newFdDir(t, msrc),
+		"fdinfo":        newFdInfoDir(t, msrc),
+		"gid_map":       newGIDMap(t, msrc),
+		"io":            newIO(t, msrc, isThreadGroup),
+		"maps":          newMaps(t, msrc),
+		"mountinfo":     seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+		"mounts":        seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"ns":            newNamespaceDir(t, msrc),
+		"oom_score":     newOOMScore(t, msrc),
+		"oom_score_adj": newOOMScoreAdj(t, msrc),
+		"smaps":         newSmaps(t, msrc),
+		"stat":          newTaskStat(t, msrc, isThreadGroup, p.pidns),
+		"statm":         newStatm(t, msrc),
+		"status":        newStatus(t, msrc, p.pidns),
+		"uid_map":       newUIDMap(t, msrc),
 	}
 	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
@@ -796,4 +798,92 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc
 	return int64(n), err
 }
 
+// newOOMScore returns a oom_score file. It is a stub that always returns 0.
+// TODO(gvisor.dev/issue/1967)
+func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	return newStaticProcInode(t, msrc, []byte("0\n"))
+}
+
+// oomScoreAdj is a file containing the oom_score adjustment for a task.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	fsutil.SimpleFileInode
+
+	t *kernel.Task
+}
+
+// +stateify savable
+type oomScoreAdjFile struct {
+	fsutil.FileGenericSeek          `state:"nosave"`
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+	waiter.AlwaysReady              `state:"nosave"`
+
+	t *kernel.Task
+}
+
+// newOOMScoreAdj returns a oom_score_adj file.
+func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	i := &oomScoreAdj{
+		SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+		t:               t,
+	}
+	return newProcInode(t, i, msrc, fs.SpecialFile, t)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		return 0, io.EOF
+	}
+	adj, err := f.t.OOMScoreAdj()
+	if err != nil {
+		return 0, err
+	}
+	adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n")
+	n, err := dst.CopyOut(ctx, adjBytes)
+	return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := f.t.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
 // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 2d814668a..18e5cd6f6 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -62,11 +62,13 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
 			"user": newNamespaceSymlink(task, inoGen.NextIno(), "user"),
 		}),
-		"smaps":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
-		"stat":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
-		"status":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
+		"status":        newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
 		contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers)
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index efd3b3453..5a231ac86 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -525,3 +525,46 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
 	return nil
 }
+
+// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	adj, err := o.task.OOMScoreAdj()
+	if err != nil {
+		return err
+	}
+	fmt.Fprintf(buf, "%d\n", adj)
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := o.task.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index c5d531fe0..0eb401619 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -63,21 +63,23 @@ var (
 		"thread-self": threadSelfLink.NextOff,
 	}
 	taskStaticFiles = map[string]testutil.DirentType{
-		"auxv":    linux.DT_REG,
-		"cgroup":  linux.DT_REG,
-		"cmdline": linux.DT_REG,
-		"comm":    linux.DT_REG,
-		"environ": linux.DT_REG,
-		"gid_map": linux.DT_REG,
-		"io":      linux.DT_REG,
-		"maps":    linux.DT_REG,
-		"ns":      linux.DT_DIR,
-		"smaps":   linux.DT_REG,
-		"stat":    linux.DT_REG,
-		"statm":   linux.DT_REG,
-		"status":  linux.DT_REG,
-		"task":    linux.DT_DIR,
-		"uid_map": linux.DT_REG,
+		"auxv":          linux.DT_REG,
+		"cgroup":        linux.DT_REG,
+		"cmdline":       linux.DT_REG,
+		"comm":          linux.DT_REG,
+		"environ":       linux.DT_REG,
+		"gid_map":       linux.DT_REG,
+		"io":            linux.DT_REG,
+		"maps":          linux.DT_REG,
+		"ns":            linux.DT_DIR,
+		"oom_score":     linux.DT_REG,
+		"oom_score_adj": linux.DT_REG,
+		"smaps":         linux.DT_REG,
+		"stat":          linux.DT_REG,
+		"statm":         linux.DT_REG,
+		"status":        linux.DT_REG,
+		"task":          linux.DT_DIR,
+		"uid_map":       linux.DT_REG,
 	}
 )
 
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 2cee2e6ed..c0dbbe890 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -554,6 +555,13 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
+
+	// oomScoreAdj is the task's OOM score adjustment. This is currently not
+	// used but is maintained for consistency.
+	// TODO(gvisor.dev/issue/1967)
+	//
+	// oomScoreAdj is protected by mu, and is owned by the task goroutine.
+	oomScoreAdj int32
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -847,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 func (t *Task) ContainerID() string {
 	return t.containerID
 }
+
+// OOMScoreAdj gets the task's OOM score adjustment.
+func (t *Task) OOMScoreAdj() (int32, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	return t.oomScoreAdj, nil
+}
+
+// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be
+// between -1000 and 1000 inclusive.
+func (t *Task) SetOOMScoreAdj(adj int32) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return syserror.ESRCH
+	}
+	if adj > 1000 || adj < -1000 {
+		return syserror.EINVAL
+	}
+	t.oomScoreAdj = adj
+	return nil
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 78866f280..dda502bb8 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -264,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		rseqSignature = t.rseqSignature
 	}
 
+	adj, err := t.OOMScoreAdj()
+	if err != nil {
+		return 0, nil, err
+	}
+
 	cfg := &TaskConfig{
 		Kernel:                  t.k,
 		ThreadGroup:             tg,
@@ -282,6 +287,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
+		OOMScoreAdj:             adj,
 	}
 	if opts.NewThreadGroup {
 		cfg.Parent = t
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index a5035bb7f..2bbf48bb8 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -93,6 +93,9 @@ type TaskConfig struct {
 
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
+
+	// oomScoreAdj is the task's OOM score adjustment.
+	OOMScoreAdj int32
 }
 
 // NewTask creates a new task defined by cfg.
@@ -143,6 +146,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		rseqSignature:      cfg.RSeqSignature,
 		futexWaiter:        futex.NewWaiter(),
 		containerID:        cfg.ContainerID,
+		oomScoreAdj:        cfg.OOMScoreAdj,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a69b0ce13..9800a0cdf 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -318,10 +318,14 @@ syscall_test(
     test = "//test/syscalls/linux:proc_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
-
 syscall_test(test = "//test/syscalls/linux:proc_net_test")
 
+syscall_test(test = "//test/syscalls/linux:proc_pid_oomscore_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_smaps_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
+
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:pselect_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 0fbd556de..43455f1a3 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1631,6 +1631,19 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "proc_pid_oomscore_test",
+    testonly = 1,
+    srcs = ["proc_pid_oomscore.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:fs_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_binary(
     name = "proc_pid_smaps_test",
     testonly = 1,
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index f91187e75..5a70f6c3b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1431,6 +1431,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 
   EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
 }
 
 // Test whether /proc/PID/ files can be read for a zombie process.
@@ -1466,6 +1472,12 @@ TEST(ProcPidFile, SubprocessZombie) {
   EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
 
+  EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
   //
@@ -1527,6 +1539,15 @@ TEST(ProcPidFile, SubprocessExited) {
 
   EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)),
+              SyscallFailsWithErrno(ESRCH));
 }
 
 PosixError DirContainsImpl(absl::string_view path,
diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc
new file mode 100644
index 000000000..707821a3f
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_oomscore.cc
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<int> ReadProcNumber(std::string path) {
+  ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path));
+  EXPECT_EQ(contents[contents.length() - 1], '\n');
+
+  int num;
+  if (!absl::SimpleAtoi(contents, &num)) {
+    return PosixError(EINVAL, "invalid value: " + contents);
+  }
+
+  return num;
+}
+
+TEST(ProcPidOomscoreTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score"));
+  EXPECT_LE(oom_score, 1000);
+  EXPECT_GE(oom_score, -1000);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+
+  // oom_score_adj defaults to 0.
+  EXPECT_EQ(oom_score, 0);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicWrite) {
+  constexpr int test_value = 7;
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY));
+  ASSERT_THAT(
+      RetryEINTR(write)(fd.get(), std::to_string(test_value).c_str(), 1),
+      SyscallSucceeds());
+
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+  EXPECT_EQ(oom_score, test_value);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From d6f5e71df2c8ff3d763cba703786af68af1f9841 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 6 Mar 2020 08:01:45 -0800
Subject: Get strings for stack.DHCPv6ConfigurationFromNDPRA

Useful for logs to print the string representation of the value
instead of the integer value.

PiperOrigin-RevId: 299356847
---
 pkg/tcpip/stack/BUILD                              |  1 +
 .../stack/dhcpv6configurationfromndpra_string.go   | 39 ++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 705cf01ee..8febd54c8 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -18,6 +18,7 @@ go_template_instance(
 go_library(
     name = "stack",
     srcs = [
+        "dhcpv6configurationfromndpra_string.go",
         "icmp_rate_limit.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
diff --git a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
new file mode 100644
index 000000000..8b4213eec
--- /dev/null
+++ b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type=DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT.
+
+package stack
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[DHCPv6NoConfiguration-0]
+	_ = x[DHCPv6ManagedAddress-1]
+	_ = x[DHCPv6OtherConfigurations-2]
+}
+
+const _DHCPv6ConfigurationFromNDPRA_name = "DHCPv6NoConfigurationDHCPv6ManagedAddressDHCPv6OtherConfigurations"
+
+var _DHCPv6ConfigurationFromNDPRA_index = [...]uint8{0, 21, 41, 66}
+
+func (i DHCPv6ConfigurationFromNDPRA) String() string {
+	if i < 0 || i >= DHCPv6ConfigurationFromNDPRA(len(_DHCPv6ConfigurationFromNDPRA_index)-1) {
+		return "DHCPv6ConfigurationFromNDPRA(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _DHCPv6ConfigurationFromNDPRA_name[_DHCPv6ConfigurationFromNDPRA_index[i]:_DHCPv6ConfigurationFromNDPRA_index[i+1]]
+}
-- 
cgit v1.2.3


From f50d9a31e9e734a02e0191f6bc91b387bb21f9ab Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 6 Mar 2020 11:32:13 -0800
Subject: Specify the source of outgoing NDP RS

If the NIC has a valid IPv6 address assigned, use it as the
source address for outgoing NDP Router Solicitation packets.

Test: stack_test.TestRouterSolicitation
PiperOrigin-RevId: 299398763
---
 pkg/tcpip/checker/checker.go | 106 +++++++++++++++++++++++++++----------------
 pkg/tcpip/stack/ndp.go       |  29 ++++++++++--
 pkg/tcpip/stack/ndp_test.go  |  39 +++++++++++++---
 3 files changed, 125 insertions(+), 49 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index c6c160dfc..8dc0f7c0e 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -785,6 +785,52 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
 	}
 }
 
+// ndpOptions checks that optsBuf only contains opts.
+func ndpOptions(t *testing.T, optsBuf header.NDPOptions, opts []header.NDPOption) {
+	t.Helper()
+
+	it, err := optsBuf.Iter(true)
+	if err != nil {
+		t.Errorf("optsBuf.Iter(true): %s", err)
+		return
+	}
+
+	i := 0
+	for {
+		opt, done, err := it.Next()
+		if err != nil {
+			// This should never happen as Iter(true) above did not return an error.
+			t.Fatalf("unexpected error when iterating over NDP options: %s", err)
+		}
+		if done {
+			break
+		}
+
+		if i >= len(opts) {
+			t.Errorf("got unexpected option: %s", opt)
+			continue
+		}
+
+		switch wantOpt := opts[i].(type) {
+		case header.NDPSourceLinkLayerAddressOption:
+			gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
+			if !ok {
+				t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+			}
+		default:
+			t.Fatalf("checker not implemented for expected NDP option: %T", wantOpt)
+		}
+
+		i++
+	}
+
+	if missing := opts[i:]; len(missing) > 0 {
+		t.Errorf("missing options: %s", missing)
+	}
+}
+
 // NDPNSOptions creates a checker that checks that the packet contains the
 // provided NDP options within an NDP Neighbor Solicitation message.
 //
@@ -796,47 +842,31 @@ func NDPNSOptions(opts []header.NDPOption) TransportChecker {
 
 		icmp := h.(header.ICMPv6)
 		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
-		it, err := ns.Options().Iter(true)
-		if err != nil {
-			t.Errorf("opts.Iter(true): %s", err)
-			return
-		}
-
-		i := 0
-		for {
-			opt, done, _ := it.Next()
-			if done {
-				break
-			}
-
-			if i >= len(opts) {
-				t.Errorf("got unexpected option: %s", opt)
-				continue
-			}
-
-			switch wantOpt := opts[i].(type) {
-			case header.NDPSourceLinkLayerAddressOption:
-				gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
-				if !ok {
-					t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
-				} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
-					t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
-				}
-			default:
-				panic("not implemented")
-			}
-
-			i++
-		}
-
-		if missing := opts[i:]; len(missing) > 0 {
-			t.Errorf("missing options: %s", missing)
-		}
+		ndpOptions(t, ns.Options(), opts)
 	}
 }
 
 // NDPRS creates a checker that checks that the packet contains a valid NDP
 // Router Solicitation message (as per the raw wire format).
-func NDPRS() NetworkChecker {
-	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize)
+//
+// checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPRS as far as the size of the message is concerned. The values within the
+// message are up to checkers to validate.
+func NDPRS(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize, checkers...)
+}
+
+// NDPRSOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Router Solicitation message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPRS message as far as the size is concerned.
+func NDPRSOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		rs := header.NDPRouterSolicit(icmp.NDPPayload())
+		ndpOptions(t, rs.Options(), opts)
+	}
 }
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index f651871ce..a9f4d5dad 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -1220,9 +1220,15 @@ func (ndp *ndpState) startSolicitingRouters() {
 	}
 
 	ndp.rtrSolicitTimer = time.AfterFunc(delay, func() {
-		// Send an RS message with the unspecified source address.
-		ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
-		r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
+		// to the sending interface, or the unspecified address if no address is
+		// assigned to the sending interface.
+		ref := ndp.nic.primaryIPv6Endpoint(header.IPv6AllRoutersMulticastAddress)
+		if ref == nil {
+			ref = ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
+		}
+		localAddr := ref.ep.ID().LocalAddress
+		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
 
 		// Route should resolve immediately since
@@ -1234,10 +1240,25 @@ func (ndp *ndpState) startSolicitingRouters() {
 			log.Fatalf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID())
 		}
 
-		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
+		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
+		// link-layer address option if the source address of the NDP RS is
+		// specified. This option MUST NOT be included if the source address is
+		// unspecified.
+		//
+		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+		// LinkEndpoint.LinkAddress) before reaching this point.
+		var optsSerializer header.NDPOptionsSerializer
+		if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(r.LocalLinkAddress) {
+			optsSerializer = header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(r.LocalLinkAddress),
+			}
+		}
+		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + int(optsSerializer.Length())
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + payloadSize)
 		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
 		pkt.SetType(header.ICMPv6RouterSolicit)
+		rs := header.NDPRouterSolicit(pkt.NDPPayload())
+		rs.Options().Serialize(optsSerializer)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		sent := r.Stats().ICMP.V6PacketsSent
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 6e9306d09..98b1c807c 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -3384,6 +3384,10 @@ func TestRouterSolicitation(t *testing.T) {
 	tests := []struct {
 		name                        string
 		linkHeaderLen               uint16
+		linkAddr                    tcpip.LinkAddress
+		nicAddr                     tcpip.Address
+		expectedSrcAddr             tcpip.Address
+		expectedNDPOpts             []header.NDPOption
 		maxRtrSolicit               uint8
 		rtrSolicitInt               time.Duration
 		effectiveRtrSolicitInt      time.Duration
@@ -3392,6 +3396,7 @@ func TestRouterSolicitation(t *testing.T) {
 	}{
 		{
 			name:                        "Single RS with delay",
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               1,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3401,6 +3406,8 @@ func TestRouterSolicitation(t *testing.T) {
 		{
 			name:                        "Two RS with delay",
 			linkHeaderLen:               1,
+			nicAddr:                     llAddr1,
+			expectedSrcAddr:             llAddr1,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3408,8 +3415,14 @@ func TestRouterSolicitation(t *testing.T) {
 			effectiveMaxRtrSolicitDelay: 500 * time.Millisecond,
 		},
 		{
-			name:                        "Single RS without delay",
-			linkHeaderLen:               2,
+			name:            "Single RS without delay",
+			linkHeaderLen:   2,
+			linkAddr:        linkAddr1,
+			nicAddr:         llAddr1,
+			expectedSrcAddr: llAddr1,
+			expectedNDPOpts: []header.NDPOption{
+				header.NDPSourceLinkLayerAddressOption(linkAddr1),
+			},
 			maxRtrSolicit:               1,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3419,6 +3432,8 @@ func TestRouterSolicitation(t *testing.T) {
 		{
 			name:                        "Two RS without delay and invalid zero interval",
 			linkHeaderLen:               3,
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               0,
 			effectiveRtrSolicitInt:      4 * time.Second,
@@ -3427,6 +3442,8 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Three RS without delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               3,
 			rtrSolicitInt:               500 * time.Millisecond,
 			effectiveRtrSolicitInt:      500 * time.Millisecond,
@@ -3435,6 +3452,8 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Two RS with invalid negative delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3457,7 +3476,7 @@ func TestRouterSolicitation(t *testing.T) {
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
 				e := channelLinkWithHeaderLength{
-					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, linkAddr1),
+					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr),
 					headerLength: test.linkHeaderLen,
 				}
 				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
@@ -3481,10 +3500,10 @@ func TestRouterSolicitation(t *testing.T) {
 
 					checker.IPv6(t,
 						p.Pkt.Header.View(),
-						checker.SrcAddr(header.IPv6Any),
+						checker.SrcAddr(test.expectedSrcAddr),
 						checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
 						checker.TTL(header.NDPHopLimit),
-						checker.NDPRS(),
+						checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)),
 					)
 
 					if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
@@ -3510,13 +3529,19 @@ func TestRouterSolicitation(t *testing.T) {
 					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 				}
 
-				// Make sure each RS got sent at the right
-				// times.
+				if addr := test.nicAddr; addr != "" {
+					if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil {
+						t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err)
+					}
+				}
+
+				// Make sure each RS is sent at the right time.
 				remaining := test.maxRtrSolicit
 				if remaining > 0 {
 					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout)
 					remaining--
 				}
+
 				for ; remaining > 0; remaining-- {
 					waitForNothing(test.effectiveRtrSolicitInt - defaultTimeout)
 					waitForPkt(defaultAsyncEventTimeout)
-- 
cgit v1.2.3


From d5dbe366bf7c9f5b648b8114a9dc7f45589899b1 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 6 Mar 2020 11:41:10 -0800
Subject: shutdown(s, SHUT_WR) in TIME-WAIT returns ENOTCONN

From RFC 793 s3.9 p61 Event Processing:

CLOSE Call during TIME-WAIT: return with "error: connection closing"

Fixes #1603

PiperOrigin-RevId: 299401353
---
 pkg/tcpip/transport/tcp/endpoint.go |  5 ++++-
 test/syscalls/linux/tcp_socket.cc   | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 40cc664c0..dc9c18b6f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2117,10 +2117,13 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 		// Close for write.
 		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
 			e.sndBufMu.Lock()
-
 			if e.sndClosed {
 				// Already closed.
 				e.sndBufMu.Unlock()
+				if e.EndpointState() == StateTimeWait {
+					e.mu.Unlock()
+					return tcpip.ErrNotConnected
+				}
 				break
 			}
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 579463384..d9c1ac0e1 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -143,6 +143,20 @@ TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
       SyscallFailsWithErrno(EISCONN));
 }
 
+TEST_P(TcpSocketTest, ShutdownWriteInTimeWait) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(s_, SHUT_RDWR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter TIME_WAIT.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(TcpSocketTest, ShutdownWriteInFinWait1) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter FIN-WAIT2.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+}
+
 TEST_P(TcpSocketTest, DataCoalesced) {
   char buf[10];
 
-- 
cgit v1.2.3


From 6fa5cee82c0f515b001dee5f3840e1f875b2f477 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Fri, 6 Mar 2020 12:30:37 -0800
Subject: Prevent memory leaks in ilist

When list elements are removed from a list but not discarded, it becomes
important to invalidate the references they hold to their former
neighbors to prevent memory leaks.

PiperOrigin-RevId: 299412421
---
 pkg/ilist/list.go                                |  8 ++++++--
 pkg/sentry/fs/dirent_cache.go                    |  2 --
 pkg/sentry/fs/inotify.go                         |  5 ++++-
 pkg/sentry/kernel/epoll/epoll_state.go           | 13 ++++++++-----
 pkg/tcpip/network/fragmentation/fragmentation.go |  8 +++++---
 5 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go
index f3a609b57..8f93e4d6d 100644
--- a/pkg/ilist/list.go
+++ b/pkg/ilist/list.go
@@ -169,8 +169,9 @@ func (l *List) InsertBefore(a, e Element) {
 
 // Remove removes e from l.
 func (l *List) Remove(e Element) {
-	prev := ElementMapper{}.linkerFor(e).Prev()
-	next := ElementMapper{}.linkerFor(e).Next()
+	linker := ElementMapper{}.linkerFor(e)
+	prev := linker.Prev()
+	next := linker.Next()
 
 	if prev != nil {
 		ElementMapper{}.linkerFor(prev).SetNext(next)
@@ -183,6 +184,9 @@ func (l *List) Remove(e Element) {
 	} else {
 		l.tail = prev
 	}
+
+	linker.SetNext(nil)
+	linker.SetPrev(nil)
 }
 
 // Entry is a default implementation of Linker. Users can add anonymous fields
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go
index 25514ace4..33de32c69 100644
--- a/pkg/sentry/fs/dirent_cache.go
+++ b/pkg/sentry/fs/dirent_cache.go
@@ -101,8 +101,6 @@ func (c *DirentCache) remove(d *Dirent) {
 		panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d))
 	}
 	c.list.Remove(d)
-	d.SetPrev(nil)
-	d.SetNext(nil)
 	d.DecRef()
 	c.currentSize--
 	if c.limit != nil {
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index 928c90aa0..e3a715c1f 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -143,7 +143,10 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i
 	}
 
 	var writeLen int64
-	for event := i.events.Front(); event != nil; event = event.Next() {
+	for it := i.events.Front(); it != nil; {
+		event := it
+		it = it.Next()
+
 		// Does the buffer have enough remaining space to hold the event we're
 		// about to write out?
 		if dst.NumBytes() < int64(event.sizeOf()) {
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
index a0d35d350..8e9f200d0 100644
--- a/pkg/sentry/kernel/epoll/epoll_state.go
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -38,11 +38,14 @@ func (e *EventPoll) afterLoad() {
 		}
 	}
 
-	for it := e.waitingList.Front(); it != nil; it = it.Next() {
-		if it.id.File.Readiness(it.mask) != 0 {
-			e.waitingList.Remove(it)
-			e.readyList.PushBack(it)
-			it.curList = &e.readyList
+	for it := e.waitingList.Front(); it != nil; {
+		entry := it
+		it = it.Next()
+
+		if entry.id.File.Readiness(entry.mask) != 0 {
+			e.waitingList.Remove(entry)
+			e.readyList.PushBack(entry)
+			entry.curList = &e.readyList
 			e.Notify(waiter.EventIn)
 		}
 	}
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 92f2aa13a..f42abc4bb 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -115,10 +115,12 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf
 	// Evict reassemblers if we are consuming more memory than highLimit until
 	// we reach lowLimit.
 	if f.size > f.highLimit {
-		tail := f.rList.Back()
-		for f.size > f.lowLimit && tail != nil {
+		for f.size > f.lowLimit {
+			tail := f.rList.Back()
+			if tail == nil {
+				break
+			}
 			f.release(tail)
-			tail = tail.Prev()
 		}
 	}
 	f.mu.Unlock()
-- 
cgit v1.2.3


From 960f6a975b7e44c0efe8fd38c66b02017c4fe137 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 6 Mar 2020 12:58:45 -0800
Subject: Add plumbing for importing fds in VFS2, along with non-socket,
 non-TTY impl.

In VFS2, imported file descriptors are stored in a kernfs-based filesystem.
Upon calling ImportFD, the host fd can be accessed in two ways:
1. a FileDescription that can be added to the FDTable, and
2. a Dentry in the host.filesystem mount, which we will want to access through
magic symlinks in /proc/[pid]/fd/.

An implementation of the kernfs.Inode interface stores a unique host fd. This
inode can be inserted into file descriptions as well as dentries.

This change also plumbs in three FileDescriptionImpls corresponding to fds for
sockets, TTYs, and other files (only the latter is implemented here).
These implementations will mostly make corresponding syscalls to the host.
Where possible, the logic is ported over from pkg/sentry/fs/host.

Updates #1672

PiperOrigin-RevId: 299417263
---
 pkg/sentry/fs/host/util.go                     |  12 +-
 pkg/sentry/fsimpl/host/BUILD                   |  27 +++
 pkg/sentry/fsimpl/host/default_file.go         | 233 ++++++++++++++++++++
 pkg/sentry/fsimpl/host/host.go                 | 286 +++++++++++++++++++++++++
 pkg/sentry/fsimpl/host/util.go                 |  86 ++++++++
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |   2 +-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |  18 +-
 pkg/sentry/fsimpl/kernfs/filesystem.go         |   6 +-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    |  54 ++---
 pkg/sentry/fsimpl/kernfs/kernfs.go             |   7 +-
 pkg/sentry/fsimpl/proc/subtasks.go             |  13 +-
 pkg/sentry/fsimpl/proc/task.go                 |  23 +-
 pkg/sentry/fsimpl/proc/tasks.go                |  19 +-
 pkg/sentry/vfs/BUILD                           |   4 +
 pkg/sentry/vfs/file_description_impl_util.go   |   4 +-
 15 files changed, 731 insertions(+), 63 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/host/BUILD
 create mode 100644 pkg/sentry/fsimpl/host/default_file.go
 create mode 100644 pkg/sentry/fsimpl/host/host.go
 create mode 100644 pkg/sentry/fsimpl/host/util.go

(limited to 'pkg')

diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
index e37e687c6..7c60dc1db 100644
--- a/pkg/sentry/fs/host/util.go
+++ b/pkg/sentry/fs/host/util.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -152,9 +152,9 @@ func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
 		Usage:            s.Blocks * 512,
 		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
 		Owner:            owner(mo, s),
-		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
-		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
-		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+		AccessTime:       time.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+		ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+		StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
 		Links:            uint64(s.Nlink),
 	}
 }
@@ -165,6 +165,8 @@ type dirInfo struct {
 	bufp int    // location of next record in buf.
 }
 
+// LINT.IfChange
+
 // isBlockError unwraps os errors and checks if they are caused by EAGAIN or
 // EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock.
 func isBlockError(err error) bool {
@@ -177,6 +179,8 @@ func isBlockError(err error) bool {
 	return false
 }
 
+// LINT.ThenChange(../../fsimpl/host/util.go)
+
 func hostEffectiveKIDs() (uint32, []uint32, error) {
 	gids, err := os.Getgroups()
 	if err != nil {
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
new file mode 100644
index 000000000..731f192b3
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "host",
+    srcs = [
+        "default_file.go",
+        "host.go",
+        "util.go",
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/safemem",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go
new file mode 100644
index 000000000..172cdb161
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/default_file.go
@@ -0,0 +1,233 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"math"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files.
+type defaultFileFD struct {
+	fileDescription
+
+	// canMap specifies whether we allow the file to be memory mapped.
+	canMap bool
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// offset specifies the current file offset.
+	offset int64
+}
+
+// TODO(gvisor.dev/issue/1672): Implement Waitable interface.
+
+// PRead implements FileDescriptionImpl.
+func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags))
+}
+
+// Read implements FileDescriptionImpl.
+func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		// These files can't be memory mapped, assert this.
+		if f.canMap {
+			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+		}
+
+		f.mu.Lock()
+		n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags))
+		f.mu.Unlock()
+		if isBlockError(err) {
+			// If we got any data at all, return it as a "completed" partial read
+			// rather than retrying until complete.
+			if n != 0 {
+				err = nil
+			} else {
+				err = syserror.ErrWouldBlock
+			}
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	f.mu.Lock()
+	n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags))
+	f.offset += n
+	f.mu.Unlock()
+	return n, err
+}
+
+func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
+	if flags&^(linux.RWF_VALID) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	reader := safemem.FromVecReaderFunc{
+		func(srcs [][]byte) (int64, error) {
+			n, err := unix.Preadv2(fd, srcs, offset, flags)
+			return int64(n), err
+		},
+	}
+	n, err := dst.CopyOutFrom(ctx, reader)
+	return int64(n), err
+}
+
+// PWrite implements FileDescriptionImpl.
+func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags))
+}
+
+// Write implements FileDescriptionImpl.
+func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if f.inode.isStream {
+		// These files can't be memory mapped, assert this.
+		if f.canMap {
+			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+		}
+
+		f.mu.Lock()
+		n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags))
+		f.mu.Unlock()
+		if isBlockError(err) {
+			err = syserror.ErrWouldBlock
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+	f.mu.Lock()
+	n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags))
+	f.offset += n
+	f.mu.Unlock()
+	return n, err
+}
+
+func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
+	if flags&^(linux.RWF_VALID) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	writer := safemem.FromVecWriterFunc{
+		func(srcs [][]byte) (int64, error) {
+			n, err := unix.Pwritev2(fd, srcs, offset, flags)
+			return int64(n), err
+		},
+	}
+	n, err := src.CopyInTo(ctx, writer)
+	return int64(n), err
+}
+
+// Seek implements FileDescriptionImpl.
+//
+// Note that we do not support seeking on directories, since we do not even
+// allow directory fds to be imported at all.
+func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
+	// TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
+	if f.inode.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = offset
+
+	case linux.SEEK_CUR:
+		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
+		if offset > math.MaxInt64-f.offset {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if f.offset+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset += offset
+
+	case linux.SEEK_END:
+		var s syscall.Stat_t
+		if err := syscall.Fstat(f.inode.hostFD, &s); err != nil {
+			return f.offset, err
+		}
+		size := s.Size
+
+		// Check for overflow. Note that underflow cannot occur, since size >= 0.
+		if offset > math.MaxInt64-size {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if size+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = size + offset
+
+	case linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Modifying the offset in the host file table should not matter, since
+		// this is the only place where we use it.
+		//
+		// For reading and writing, we always rely on our internal offset.
+		n, err := unix.Seek(f.inode.hostFD, offset, int(whence))
+		if err != nil {
+			return f.offset, err
+		}
+		f.offset = n
+
+	default:
+		// Invalid whence.
+		return f.offset, syserror.EINVAL
+	}
+
+	return f.offset, nil
+}
+
+// Sync implements FileDescriptionImpl.
+func (f *defaultFileFD) Sync(context.Context) error {
+	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+	return unix.Fsync(f.inode.hostFD)
+}
+
+// ConfigureMMap implements FileDescriptionImpl.
+func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
+	if !f.canMap {
+		return syserror.ENODEV
+	}
+	// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
+	return syserror.ENODEV
+}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
new file mode 100644
index 000000000..c205e6a0b
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -0,0 +1,286 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host provides a filesystem implementation for host files imported as
+// file descriptors.
+package host
+
+import (
+	"errors"
+	"fmt"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
+func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) {
+	// Must be importing to a mount of host.filesystem.
+	fs, ok := mnt.Filesystem().Impl().(*filesystem)
+	if !ok {
+		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
+	}
+
+	// Retrieve metadata.
+	var s syscall.Stat_t
+	if err := syscall.Fstat(hostFD, &s); err != nil {
+		return nil, err
+	}
+
+	fileMode := linux.FileMode(s.Mode)
+	fileType := fileMode.FileType()
+	// Pipes, character devices, and sockets can return EWOULDBLOCK for
+	// operations that would block.
+	isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+
+	i := &inode{
+		hostFD:   hostFD,
+		isStream: isStream,
+		isTTY:    isTTY,
+		ino:      fs.NextIno(),
+		mode:     fileMode,
+		uid:      ownerUID,
+		gid:      ownerGID,
+	}
+
+	d := &kernfs.Dentry{}
+	d.Init(i)
+	// i.open will take a reference on d.
+	defer d.DecRef()
+
+	return i.open(d.VFSDentry(), mnt)
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	// When the reference count reaches zero, the host fd is closed.
+	refs.AtomicRefCount
+
+	// hostFD contains the host fd that this file was originally created from,
+	// which must be available at time of restore.
+	//
+	// This field is initialized at creation time and is immutable.
+	hostFD int
+
+	// isStream is true if the host fd points to a file representing a stream,
+	// e.g. a socket or a pipe. Such files are not seekable and can return
+	// EWOULDBLOCK for I/O operations.
+	//
+	// This field is initialized at creation time and is immutable.
+	isStream bool
+
+	// isTTY is true if this file represents a TTY.
+	//
+	// This field is initialized at creation time and is immutable.
+	isTTY bool
+
+	// ino is an inode number unique within this filesystem.
+	ino uint64
+
+	// mu protects the inode metadata below.
+	mu sync.Mutex
+
+	// mode is the file mode of this inode. Note that this value may become out
+	// of date if the mode is changed on the host, e.g. with chmod.
+	mode linux.FileMode
+
+	// uid and gid of the file owner. Note that these refer to the owner of the
+	// file created on import, not the fd on the host.
+	uid auth.KUID
+	gid auth.KGID
+}
+
+// Note that these flags may become out of date, since they can be modified
+// on the host, e.g. with fcntl.
+func fileFlagsFromHostFD(fd int) (int, error) {
+	flags, err := unix.FcntlInt(uintptr(fd), syscall.F_GETFL, 0)
+	if err != nil {
+		log.Warningf("Failed to get file flags for donated FD %d: %v", fd, err)
+		return 0, err
+	}
+	// TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags.
+	flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+	return flags, nil
+}
+
+// CheckPermissions implements kernfs.Inode.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid)
+}
+
+// Mode implements kernfs.Inode.
+func (i *inode) Mode() linux.FileMode {
+	return i.mode
+}
+
+// Stat implements kernfs.Inode.
+func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	var s unix.Statx_t
+	if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil {
+		return linux.Statx{}, err
+	}
+	ls := unixToLinuxStatx(s)
+
+	// Use our own internal inode number and file owner.
+	//
+	// TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well.
+	// If we use the device number from the host, it may collide with another
+	// sentry-internal device number. We handle device/inode numbers without
+	// relying on the host to prevent collisions.
+	ls.Ino = i.ino
+	ls.UID = uint32(i.uid)
+	ls.GID = uint32(i.gid)
+
+	// Update file mode from the host.
+	i.mode = linux.FileMode(ls.Mode)
+
+	return ls, nil
+}
+
+// SetStat implements kernfs.Inode.
+func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	s := opts.Stat
+
+	m := s.Mask
+	if m == 0 {
+		return nil
+	}
+	if m&(linux.STATX_UID|linux.STATX_GID) != 0 {
+		return syserror.EPERM
+	}
+	if m&linux.STATX_MODE != 0 {
+		if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
+			return err
+		}
+		i.mode = linux.FileMode(s.Mode)
+	}
+	if m&linux.STATX_SIZE != 0 {
+		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
+			return err
+		}
+	}
+	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
+		timestamps := []unix.Timespec{
+			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
+			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
+		}
+		if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// DecRef implements kernfs.Inode.
+func (i *inode) DecRef() {
+	i.AtomicRefCount.DecRefWithDestructor(i.Destroy)
+}
+
+// Destroy implements kernfs.Inode.
+func (i *inode) Destroy() {
+	if err := unix.Close(i.hostFD); err != nil {
+		log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
+	}
+}
+
+// Open implements kernfs.Inode.
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.open(vfsd, rp.Mount())
+}
+
+func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
+
+	fileType := i.mode.FileType()
+	if fileType == syscall.S_IFSOCK {
+		if i.isTTY {
+			return nil, errors.New("cannot use host socket as TTY")
+		}
+		// TODO(gvisor.dev/issue/1672): support importing sockets.
+		return nil, errors.New("importing host sockets not supported")
+	}
+
+	if i.isTTY {
+		// TODO(gvisor.dev/issue/1672): support importing host fd as TTY.
+		return nil, errors.New("importing host fd as TTY not supported")
+	}
+
+	// For simplicity, set offset to 0. Technically, we should
+	// only set to 0 on files that are not seekable (sockets, pipes, etc.),
+	// and use the offset from the host fd otherwise.
+	fd := &defaultFileFD{
+		fileDescription: fileDescription{
+			inode: i,
+		},
+		canMap: canMap(uint32(fileType)),
+		mu:     sync.Mutex{},
+		offset: 0,
+	}
+
+	vfsfd := &fd.vfsfd
+	flags, err := fileFlagsFromHostFD(i.hostFD)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return vfsfd, nil
+}
+
+// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
+	// cached to reduce indirections and casting. fileDescription does not hold
+	// a reference on the inode through the inode field (since one is already
+	// held via the Dentry).
+	//
+	// inode is immutable after fileDescription creation.
+	inode *inode
+}
+
+// SetStat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) SetStat(_ context.Context, opts vfs.SetStatOptions) error {
+	return f.inode.SetStat(nil, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return f.inode.Stat(nil, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Release() {
+	// noop
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
new file mode 100644
index 000000000..e1ccacb4d
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -0,0 +1,86 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec {
+	if omit {
+		return unix.Timespec{
+			Sec:  0,
+			Nsec: unix.UTIME_OMIT,
+		}
+	}
+	return unix.Timespec{
+		Sec:  int64(ts.Sec),
+		Nsec: int64(ts.Nsec),
+	}
+}
+
+func unixToLinuxStatx(s unix.Statx_t) linux.Statx {
+	return linux.Statx{
+		Mask:           s.Mask,
+		Blksize:        s.Blksize,
+		Attributes:     s.Attributes,
+		Nlink:          s.Nlink,
+		UID:            s.Uid,
+		GID:            s.Gid,
+		Mode:           s.Mode,
+		Ino:            s.Ino,
+		Size:           s.Size,
+		Blocks:         s.Blocks,
+		AttributesMask: s.Attributes_mask,
+		Atime:          unixToLinuxStatxTimestamp(s.Atime),
+		Btime:          unixToLinuxStatxTimestamp(s.Btime),
+		Ctime:          unixToLinuxStatxTimestamp(s.Ctime),
+		Mtime:          unixToLinuxStatxTimestamp(s.Mtime),
+		RdevMajor:      s.Rdev_major,
+		RdevMinor:      s.Rdev_minor,
+		DevMajor:       s.Dev_major,
+		DevMinor:       s.Dev_minor,
+	}
+}
+
+func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec}
+}
+
+// wouldBlock returns true for file types that can return EWOULDBLOCK
+// for blocking operations, e.g. pipes, character devices, and sockets.
+func wouldBlock(fileType uint32) bool {
+	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+}
+
+// canMap returns true if a file with fileType is allowed to be memory mapped.
+// This is ported over from VFS1, but it's probably not the best way for us
+// to check if a file can be memory mapped.
+func canMap(fileType uint32) bool {
+	// TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()).
+	//
+	// TODO(b/38213152): Some obscure character devices can be mapped.
+	return fileType == syscall.S_IFREG
+}
+
+// isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
+// If so, they can be transformed into syserror.ErrWouldBlock.
+func isBlockError(err error) bool {
+	return err == syserror.EAGAIN || err == syserror.EWOULDBLOCK
+}
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index d092ccb2a..1c026f4d8 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -122,7 +122,7 @@ func (fd *DynamicBytesFD) Release() {}
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return fd.inode.Stat(fs), nil
+	return fd.inode.Stat(fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 5650512e0..da821d524 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -107,9 +107,13 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 
+	opts := vfs.StatOptions{Mask: linux.STATX_INO}
 	// Handle ".".
 	if fd.off == 0 {
-		stat := fd.inode().Stat(vfsFS)
+		stat, err := fd.inode().Stat(vfsFS, opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
@@ -125,7 +129,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	// Handle "..".
 	if fd.off == 1 {
 		parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode
-		stat := parentInode.Stat(vfsFS)
+		stat, err := parentInode.Stat(vfsFS, opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    "..",
 			Type:    linux.FileMode(stat.Mode).DirentType(),
@@ -146,7 +153,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	childIdx := fd.off - 2
 	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
 		inode := it.Dentry.Impl().(*Dentry).inode
-		stat := inode.Stat(vfsFS)
+		stat, err := inode.Stat(vfsFS, opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    it.Name,
 			Type:    linux.FileMode(stat.Mode).DirentType(),
@@ -190,7 +200,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
 	inode := fd.inode()
-	return inode.Stat(fs), nil
+	return inode.Stat(fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 292f58afd..1d7e04ad4 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// This file implements vfs.FilesystemImpl for kernfs.
-
 package kernfs
 
+// This file implements vfs.FilesystemImpl for kernfs.
+
 import (
 	"fmt"
 
@@ -634,7 +634,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
-	return inode.Stat(fs.VFSFilesystem()), nil
+	return inode.Stat(fs.VFSFilesystem(), opts)
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 099d70a16..d50018b18 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -36,20 +36,20 @@ type InodeNoopRefCount struct {
 }
 
 // IncRef implements Inode.IncRef.
-func (n *InodeNoopRefCount) IncRef() {
+func (InodeNoopRefCount) IncRef() {
 }
 
 // DecRef implements Inode.DecRef.
-func (n *InodeNoopRefCount) DecRef() {
+func (InodeNoopRefCount) DecRef() {
 }
 
 // TryIncRef implements Inode.TryIncRef.
-func (n *InodeNoopRefCount) TryIncRef() bool {
+func (InodeNoopRefCount) TryIncRef() bool {
 	return true
 }
 
 // Destroy implements Inode.Destroy.
-func (n *InodeNoopRefCount) Destroy() {
+func (InodeNoopRefCount) Destroy() {
 }
 
 // InodeDirectoryNoNewChildren partially implements the Inode interface.
@@ -58,27 +58,27 @@ func (n *InodeNoopRefCount) Destroy() {
 type InodeDirectoryNoNewChildren struct{}
 
 // NewFile implements Inode.NewFile.
-func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewDir implements Inode.NewDir.
-func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewLink implements Inode.NewLink.
-func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewNode implements Inode.NewNode.
-func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
@@ -90,62 +90,62 @@ type InodeNotDirectory struct {
 }
 
 // HasChildren implements Inode.HasChildren.
-func (*InodeNotDirectory) HasChildren() bool {
+func (InodeNotDirectory) HasChildren() bool {
 	return false
 }
 
 // NewFile implements Inode.NewFile.
-func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
 	panic("NewFile called on non-directory inode")
 }
 
 // NewDir implements Inode.NewDir.
-func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
 	panic("NewDir called on non-directory inode")
 }
 
 // NewLink implements Inode.NewLinkink.
-func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
 	panic("NewLink called on non-directory inode")
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
 	panic("NewSymlink called on non-directory inode")
 }
 
 // NewNode implements Inode.NewNode.
-func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
 	panic("NewNode called on non-directory inode")
 }
 
 // Unlink implements Inode.Unlink.
-func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
 	panic("Unlink called on non-directory inode")
 }
 
 // RmDir implements Inode.RmDir.
-func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
 	panic("RmDir called on non-directory inode")
 }
 
 // Rename implements Inode.Rename.
-func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
 	panic("Rename called on non-directory inode")
 }
 
 // Lookup implements Inode.Lookup.
-func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	panic("Lookup called on non-directory inode")
 }
 
 // IterDirents implements Inode.IterDirents.
-func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
 	panic("IterDirents called on non-directory inode")
 }
 
 // Valid implements Inode.Valid.
-func (*InodeNotDirectory) Valid(context.Context) bool {
+func (InodeNotDirectory) Valid(context.Context) bool {
 	return true
 }
 
@@ -157,17 +157,17 @@ func (*InodeNotDirectory) Valid(context.Context) bool {
 type InodeNoDynamicLookup struct{}
 
 // Lookup implements Inode.Lookup.
-func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	return nil, syserror.ENOENT
 }
 
 // IterDirents implements Inode.IterDirents.
-func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	return offset, nil
 }
 
 // Valid implements Inode.Valid.
-func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
+func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 	return true
 }
 
@@ -177,7 +177,7 @@ func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 type InodeNotSymlink struct{}
 
 // Readlink implements Inode.Readlink.
-func (*InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context) (string, error) {
 	return "", syserror.EINVAL
 }
 
@@ -219,7 +219,7 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 // Stat partially implements Inode.Stat. Note that this function doesn't provide
 // all the stat fields, and the embedder should consider extending the result
 // with filesystem-specific fields.
-func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
+func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
 	stat.Ino = atomic.LoadUint64(&a.ino)
@@ -230,7 +230,7 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
 
 	// TODO: Implement other stat fields like timestamps.
 
-	return stat
+	return stat, nil
 }
 
 // SetStat implements Inode.SetStat.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index c74fa999b..a8ab2a2ba 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -176,8 +176,6 @@ type Dentry struct {
 	vfsd  vfs.Dentry
 	inode Inode
 
-	refs uint64
-
 	// flags caches useful information about the dentry from the inode. See the
 	// dflags* consts above. Must be accessed by atomic ops.
 	flags uint32
@@ -302,7 +300,8 @@ type Inode interface {
 	// this inode. The returned file description should hold a reference on the
 	// inode for its lifetime.
 	//
-	// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
+	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
+	// the inode on which Open() is being called.
 	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
 }
 
@@ -328,7 +327,7 @@ type inodeMetadata interface {
 
 	// Stat returns the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.StatAt.
-	Stat(fs *vfs.Filesystem) linux.Statx
+	Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
 
 	// SetStat updates the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.SetStatAt.
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index f3f4e49b4..611645f3f 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -121,8 +121,13 @@ func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.O
 }
 
 // Stat implements kernfs.Inode.
-func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
-	stat := i.InodeAttrs.Stat(vsfs)
-	stat.Nlink += uint32(i.task.ThreadGroup().Count())
-	return stat
+func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		stat.Nlink += uint32(i.task.ThreadGroup().Count())
+	}
+	return stat, nil
 }
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 18e5cd6f6..c0d643f51 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -154,12 +154,21 @@ func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, childre
 }
 
 // Stat implements kernfs.Inode.
-func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
-	stat := i.Inode.Stat(fs)
-	uid, gid := i.getOwner(linux.FileMode(stat.Mode))
-	stat.UID = uint32(uid)
-	stat.GID = uint32(gid)
-	return stat
+func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.Inode.Stat(fs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 {
+		uid, gid := i.getOwner(linux.FileMode(stat.Mode))
+		if opts.Mask&linux.STATX_UID != 0 {
+			stat.UID = uint32(uid)
+		}
+		if opts.Mask&linux.STATX_GID != 0 {
+			stat.GID = uint32(gid)
+		}
+	}
+	return stat, nil
 }
 
 // CheckPermissions implements kernfs.Inode.
@@ -236,7 +245,7 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr
 // member, there is one entry containing three colon-separated fields:
 //   hierarchy-ID:controller-list:cgroup-path"
 func newCgroupData(controllers map[string]string) dynamicInode {
-	buf := bytes.Buffer{}
+	var buf bytes.Buffer
 
 	// The hierarchy ids must be positive integers (for cgroup v1), but the
 	// exact number does not matter, so long as they are unique. We can
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 10c08fa90..b1e39c82f 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -211,17 +211,22 @@ func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.Open
 	return fd.VFSFileDescription(), nil
 }
 
-func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
-	stat := i.InodeAttrs.Stat(vsfs)
+func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
 
-	// Add dynamic children to link count.
-	for _, tg := range i.pidns.ThreadGroups() {
-		if leader := tg.Leader(); leader != nil {
-			stat.Nlink++
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		// Add dynamic children to link count.
+		for _, tg := range i.pidns.ThreadGroups() {
+			if leader := tg.Leader(); leader != nil {
+				stat.Nlink++
+			}
 		}
 	}
 
-	return stat
+	return stat, nil
 }
 
 func cpuInfoData(k *kernel.Kernel) string {
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 07c8383e6..cb4deb068 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -42,10 +42,13 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
@@ -53,6 +56,7 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index c2a52ec1b..45191d1c3 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -33,8 +33,8 @@ import (
 // implementations to adapt:
 //   - Have a local fileDescription struct (containing FileDescription) which
 //     embeds FileDescriptionDefaultImpl and overrides the default methods
-//     which are common to all fd implementations for that for that filesystem
-//     like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+//     which are common to all fd implementations for that filesystem like
+//     StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
 //   - This should be embedded in all file description implementations as the
 //     first field by value.
 //   - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
-- 
cgit v1.2.3


From 228813fd26aadea012cd8c39e084e4b0fb23d273 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 6 Mar 2020 15:22:30 -0800
Subject: Update comments and debug level for profiling options.

PiperOrigin-RevId: 299448307
---
 pkg/sentry/control/pprof.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 5d1907c0e..663e51989 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -117,15 +117,15 @@ func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error {
 	return nil
 }
 
-// GoroutineProfile is an RPC stub which dumps out the stack trace for all running
-// goroutines.
+// GoroutineProfile is an RPC stub which dumps out the stack trace for all
+// running goroutines.
 func (p *Profile) GoroutineProfile(o *ProfileOpts, _ *struct{}) error {
 	if len(o.FilePayload.Files) < 1 {
 		return errNoOutput
 	}
 	output := o.FilePayload.Files[0]
 	defer output.Close()
-	if err := pprof.Lookup("goroutine").WriteTo(output, 0); err != nil {
+	if err := pprof.Lookup("goroutine").WriteTo(output, 2); err != nil {
 		return err
 	}
 	return nil
-- 
cgit v1.2.3