From ae1cdd6d5a522e4de94747348e35d1964dc43c77 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 14 Aug 2020 17:27:23 -0700
Subject: Use a single NetworkEndpoint per NIC per protocol

The NetworkEndpoint does not need to be created for each address.
Most of the work the NetworkEndpoint does is address agnostic.

PiperOrigin-RevId: 326759605
---
 pkg/tcpip/network/BUILD             |  1 +
 pkg/tcpip/network/arp/arp.go        | 15 +------
 pkg/tcpip/network/ip_test.go        | 77 +++++++++++++++++----------------
 pkg/tcpip/network/ipv4/icmp.go      |  7 +--
 pkg/tcpip/network/ipv4/ipv4.go      | 20 +--------
 pkg/tcpip/network/ipv6/icmp.go      |  7 +--
 pkg/tcpip/network/ipv6/icmp_test.go |  6 +--
 pkg/tcpip/network/ipv6/ipv6.go      | 20 ++-------
 pkg/tcpip/network/ipv6/ndp_test.go  |  5 +--
 pkg/tcpip/stack/forwarder_test.go   | 18 ++------
 pkg/tcpip/stack/ndp.go              |  8 ++--
 pkg/tcpip/stack/nic.go              | 86 +++++++++++++++++--------------------
 pkg/tcpip/stack/nic_test.go         | 30 ++++---------
 pkg/tcpip/stack/registration.go     |  8 +---
 pkg/tcpip/stack/stack.go            |  6 +--
 pkg/tcpip/stack/stack_test.go       | 20 ++-------
 pkg/tcpip/transport/udp/udp_test.go | 10 +----
 17 files changed, 123 insertions(+), 221 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index 6a4839fb8..46083925c 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -12,6 +12,7 @@ go_test(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 1ad788a17..920872c3f 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -66,14 +66,6 @@ func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
 	return e.linkEP.Capabilities()
 }
 
-func (e *endpoint) ID() *stack.NetworkEndpointID {
-	return &stack.NetworkEndpointID{ProtocolAddress}
-}
-
-func (e *endpoint) PrefixLen() int {
-	return 0
-}
-
 func (e *endpoint) MaxHeaderLength() uint16 {
 	return e.linkEP.MaxHeaderLength() + header.ARPSize
 }
@@ -142,16 +134,13 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 	return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress
 }
 
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
-	if addrWithPrefix.Address != ProtocolAddress {
-		return nil, tcpip.ErrBadLocalAddress
-	}
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
 	return &endpoint{
 		protocol:      p,
 		nicID:         nicID,
 		linkEP:        sender,
 		linkAddrCache: linkAddrCache,
-	}, nil
+	}
 }
 
 // LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol.
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 491d936a1..9007346fe 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
@@ -41,6 +42,7 @@ const (
 	ipv6SubnetAddr     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	ipv6SubnetMask     = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00"
 	ipv6Gateway        = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
+	nicID              = 1
 )
 
 // testObject implements two interfaces: LinkEndpoint and TransportDispatcher.
@@ -195,15 +197,15 @@ func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
 	})
-	s.CreateNIC(1, loopback.New())
-	s.AddAddress(1, ipv4.ProtocolNumber, local)
+	s.CreateNIC(nicID, loopback.New())
+	s.AddAddress(nicID, ipv4.ProtocolNumber, local)
 	s.SetRouteTable([]tcpip.Route{{
 		Destination: header.IPv4EmptySubnet,
 		Gateway:     ipv4Gateway,
 		NIC:         1,
 	}})
 
-	return s.FindRoute(1, local, remote, ipv4.ProtocolNumber, false /* multicastLoop */)
+	return s.FindRoute(nicID, local, remote, ipv4.ProtocolNumber, false /* multicastLoop */)
 }
 
 func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
@@ -211,31 +213,45 @@ func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
 	})
-	s.CreateNIC(1, loopback.New())
-	s.AddAddress(1, ipv6.ProtocolNumber, local)
+	s.CreateNIC(nicID, loopback.New())
+	s.AddAddress(nicID, ipv6.ProtocolNumber, local)
 	s.SetRouteTable([]tcpip.Route{{
 		Destination: header.IPv6EmptySubnet,
 		Gateway:     ipv6Gateway,
 		NIC:         1,
 	}})
 
-	return s.FindRoute(1, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */)
+	return s.FindRoute(nicID, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */)
 }
 
-func buildDummyStack() *stack.Stack {
-	return stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+func buildDummyStack(t *testing.T) *stack.Stack {
+	t.Helper()
+
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
 	})
+	e := channel.New(0, 1280, "")
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, localIpv4Addr); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, localIpv4Addr, err)
+	}
+
+	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, localIpv6Addr); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, localIpv6Addr, err)
+	}
+
+	return s
 }
 
 func TestIPv4Send(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, nil, &o, buildDummyStack())
-	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
-	}
+	ep := proto.NewEndpoint(nicID, nil, nil, &o, buildDummyStack(t))
+	defer ep.Close()
 
 	// Allocate and initialize the payload view.
 	payload := buffer.NewView(100)
@@ -271,10 +287,8 @@ func TestIPv4Send(t *testing.T) {
 func TestIPv4Receive(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
-	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
-	}
+	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	defer ep.Close()
 
 	totalLen := header.IPv4MinimumSize + 30
 	view := buffer.NewView(totalLen)
@@ -343,10 +357,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 		t.Run(c.name, func(t *testing.T) {
 			o := testObject{t: t}
 			proto := ipv4.NewProtocol()
-			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
-			if err != nil {
-				t.Fatalf("NewEndpoint failed: %v", err)
-			}
+			ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
 			defer ep.Close()
 
 			const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize
@@ -407,10 +418,8 @@ func TestIPv4ReceiveControl(t *testing.T) {
 func TestIPv4FragmentationReceive(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
-	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
-	}
+	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	defer ep.Close()
 
 	totalLen := header.IPv4MinimumSize + 24
 
@@ -486,10 +495,8 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 func TestIPv6Send(t *testing.T) {
 	o := testObject{t: t}
 	proto := ipv6.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, nil, &o, buildDummyStack())
-	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
-	}
+	ep := proto.NewEndpoint(nicID, nil, &o, channel.New(0, 1280, ""), buildDummyStack(t))
+	defer ep.Close()
 
 	// Allocate and initialize the payload view.
 	payload := buffer.NewView(100)
@@ -525,10 +532,8 @@ func TestIPv6Send(t *testing.T) {
 func TestIPv6Receive(t *testing.T) {
 	o := testObject{t: t}
 	proto := ipv6.NewProtocol()
-	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil, buildDummyStack())
-	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
-	}
+	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	defer ep.Close()
 
 	totalLen := header.IPv6MinimumSize + 30
 	view := buffer.NewView(totalLen)
@@ -606,11 +611,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 		t.Run(c.name, func(t *testing.T) {
 			o := testObject{t: t}
 			proto := ipv6.NewProtocol()
-			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil, buildDummyStack())
-			if err != nil {
-				t.Fatalf("NewEndpoint failed: %v", err)
-			}
-
+			ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
 			defer ep.Close()
 
 			dataOffset := header.IPv6MinimumSize*2 + header.ICMPv6MinimumSize
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 067d770f3..b5659a36b 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -37,8 +37,9 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	// false.
 	//
 	// Drop packet if it doesn't have the basic IPv4 header or if the
-	// original source address doesn't match the endpoint's address.
-	if hdr.SourceAddress() != e.id.LocalAddress {
+	// original source address doesn't match an address we own.
+	src := hdr.SourceAddress()
+	if e.stack.CheckLocalAddress(e.NICID(), ProtocolNumber, src) == 0 {
 		return
 	}
 
@@ -53,7 +54,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	// Skip the ip header, then deliver control message.
 	pkt.Data.TrimFront(hlen)
 	p := hdr.TransportProtocol()
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	e.dispatcher.DeliverTransportControlPacket(src, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) {
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 3cd48ceb3..79872ec9a 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -52,8 +52,6 @@ const (
 
 type endpoint struct {
 	nicID      tcpip.NICID
-	id         stack.NetworkEndpointID
-	prefixLen  int
 	linkEP     stack.LinkEndpoint
 	dispatcher stack.TransportDispatcher
 	protocol   *protocol
@@ -61,18 +59,14 @@ type endpoint struct {
 }
 
 // NewEndpoint creates a new ipv4 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
-	e := &endpoint{
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
+	return &endpoint{
 		nicID:      nicID,
-		id:         stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
-		prefixLen:  addrWithPrefix.PrefixLen,
 		linkEP:     linkEP,
 		dispatcher: dispatcher,
 		protocol:   p,
 		stack:      st,
 	}
-
-	return e, nil
 }
 
 // DefaultTTL is the default time-to-live value for this endpoint.
@@ -96,16 +90,6 @@ func (e *endpoint) NICID() tcpip.NICID {
 	return e.nicID
 }
 
-// ID returns the ipv4 endpoint ID.
-func (e *endpoint) ID() *stack.NetworkEndpointID {
-	return &e.id
-}
-
-// PrefixLen returns the ipv4 endpoint subnet prefix length in bits.
-func (e *endpoint) PrefixLen() int {
-	return e.prefixLen
-}
-
 // MaxHeaderLength returns the maximum length needed by ipv4 headers (and
 // underlying protocols).
 func (e *endpoint) MaxHeaderLength() uint16 {
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 39ae19295..66d3a953a 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -39,8 +39,9 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	// is truncated, which would cause IsValid to return false.
 	//
 	// Drop packet if it doesn't have the basic IPv6 header or if the
-	// original source address doesn't match the endpoint's address.
-	if hdr.SourceAddress() != e.id.LocalAddress {
+	// original source address doesn't match an address we own.
+	src := hdr.SourceAddress()
+	if e.stack.CheckLocalAddress(e.NICID(), ProtocolNumber, src) == 0 {
 		return
 	}
 
@@ -67,7 +68,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	}
 
 	// Deliver the control packet to the transport endpoint.
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	e.dispatcher.DeliverTransportControlPacket(src, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
 func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) {
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 2a2f7de01..9e4eeea77 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -114,10 +114,8 @@ func TestICMPCounts(t *testing.T) {
 	if netProto == nil {
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
-	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{lladdr1, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
-	if err != nil {
-		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
-	}
+	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
+	defer ep.Close()
 
 	r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
 	if err != nil {
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 0ade655b2..0eafe9790 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -46,12 +46,11 @@ const (
 
 type endpoint struct {
 	nicID         tcpip.NICID
-	id            stack.NetworkEndpointID
-	prefixLen     int
 	linkEP        stack.LinkEndpoint
 	linkAddrCache stack.LinkAddressCache
 	dispatcher    stack.TransportDispatcher
 	protocol      *protocol
+	stack         *stack.Stack
 }
 
 // DefaultTTL is the default hop limit for this endpoint.
@@ -70,16 +69,6 @@ func (e *endpoint) NICID() tcpip.NICID {
 	return e.nicID
 }
 
-// ID returns the ipv6 endpoint ID.
-func (e *endpoint) ID() *stack.NetworkEndpointID {
-	return &e.id
-}
-
-// PrefixLen returns the ipv6 endpoint subnet prefix length in bits.
-func (e *endpoint) PrefixLen() int {
-	return e.prefixLen
-}
-
 // Capabilities implements stack.NetworkEndpoint.Capabilities.
 func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
 	return e.linkEP.Capabilities()
@@ -464,16 +453,15 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }
 
 // NewEndpoint creates a new ipv6 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
 	return &endpoint{
 		nicID:         nicID,
-		id:            stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
-		prefixLen:     addrWithPrefix.PrefixLen,
 		linkEP:        linkEP,
 		linkAddrCache: linkAddrCache,
 		dispatcher:    dispatcher,
 		protocol:      p,
-	}, nil
+		stack:         st,
+	}
 }
 
 // SetOption implements NetworkProtocol.SetOption.
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 2efa82e60..af71a7d6b 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -63,10 +63,7 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
 
-	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{rlladdr, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
-	if err != nil {
-		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
-	}
+	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
 
 	return s, ep
 }
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 944f622fd..5a684eb9d 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -46,8 +46,6 @@ const (
 // protocol. They're all one byte fields to simplify parsing.
 type fwdTestNetworkEndpoint struct {
 	nicID      tcpip.NICID
-	id         NetworkEndpointID
-	prefixLen  int
 	proto      *fwdTestNetworkProtocol
 	dispatcher TransportDispatcher
 	ep         LinkEndpoint
@@ -61,18 +59,10 @@ func (f *fwdTestNetworkEndpoint) NICID() tcpip.NICID {
 	return f.nicID
 }
 
-func (f *fwdTestNetworkEndpoint) PrefixLen() int {
-	return f.prefixLen
-}
-
 func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 {
 	return 123
 }
 
-func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
-	return &f.id
-}
-
 func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) {
 	// Dispatch the packet to the transport protocol.
 	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader().View()[protocolNumberOffset]), pkt)
@@ -99,7 +89,7 @@ func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkH
 	// endpoint.
 	b := pkt.NetworkHeader().Push(fwdTestNetHeaderLen)
 	b[dstAddrOffset] = r.RemoteAddress[0]
-	b[srcAddrOffset] = f.id.LocalAddress[0]
+	b[srcAddrOffset] = r.LocalAddress[0]
 	b[protocolNumberOffset] = byte(params.Protocol)
 
 	return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt)
@@ -151,15 +141,13 @@ func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocol
 	return tcpip.TransportProtocolNumber(netHeader[protocolNumberOffset]), true, true
 }
 
-func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
+func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) NetworkEndpoint {
 	return &fwdTestNetworkEndpoint{
 		nicID:      nicID,
-		id:         NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
-		prefixLen:  addrWithPrefix.PrefixLen,
 		proto:      f,
 		dispatcher: dispatcher,
 		ep:         ep,
-	}, nil
+	}
 }
 
 func (f *fwdTestNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 93567806b..b0873d1af 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -728,7 +728,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
 	snmc := header.SolicitedNodeAddr(addr)
 
-	r := makeRoute(header.IPv6ProtocolNumber, ref.ep.ID().LocalAddress, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+	r := makeRoute(header.IPv6ProtocolNumber, ref.address(), snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
 	// Route should resolve immediately since snmc is a multicast address so a
@@ -1353,7 +1353,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 		return false
 	}
 
-	stableAddr := prefixState.stableAddr.ref.ep.ID().LocalAddress
+	stableAddr := prefixState.stableAddr.ref.address()
 	now := time.Now()
 
 	// As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary
@@ -1690,7 +1690,7 @@ func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPr
 
 	prefix := addr.Subnet()
 	state, ok := ndp.slaacPrefixes[prefix]
-	if !ok || state.stableAddr.ref == nil || addr.Address != state.stableAddr.ref.ep.ID().LocalAddress {
+	if !ok || state.stableAddr.ref == nil || addr.Address != state.stableAddr.ref.address() {
 		return
 	}
 
@@ -1867,7 +1867,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 		}
 		ndp.nic.mu.Unlock()
 
-		localAddr := ref.ep.ID().LocalAddress
+		localAddr := ref.address()
 		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
 
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 2315ea5b9..10d2b7964 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -45,8 +45,9 @@ type NIC struct {
 	linkEP  LinkEndpoint
 	context NICContext
 
-	stats NICStats
-	neigh *neighborCache
+	stats            NICStats
+	neigh            *neighborCache
+	networkEndpoints map[tcpip.NetworkProtocolNumber]NetworkEndpoint
 
 	mu struct {
 		sync.RWMutex
@@ -114,12 +115,13 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	// of IPv6 is supported on this endpoint's LinkEndpoint.
 
 	nic := &NIC{
-		stack:   stack,
-		id:      id,
-		name:    name,
-		linkEP:  ep,
-		context: ctx,
-		stats:   makeNICStats(),
+		stack:            stack,
+		id:               id,
+		name:             name,
+		linkEP:           ep,
+		context:          ctx,
+		stats:            makeNICStats(),
+		networkEndpoints: make(map[tcpip.NetworkProtocolNumber]NetworkEndpoint),
 	}
 	nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint)
 	nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
@@ -140,7 +142,9 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 		nic.mu.packetEPs[netProto] = []PacketEndpoint{}
 	}
 	for _, netProto := range stack.networkProtocols {
-		nic.mu.packetEPs[netProto.Number()] = []PacketEndpoint{}
+		netNum := netProto.Number()
+		nic.mu.packetEPs[netNum] = nil
+		nic.networkEndpoints[netNum] = netProto.NewEndpoint(id, stack, nic, ep, stack)
 	}
 
 	// Check for Neighbor Unreachability Detection support.
@@ -205,7 +209,7 @@ func (n *NIC) disableLocked() *tcpip.Error {
 		// Stop DAD for all the unicast IPv6 endpoints that are in the
 		// permanentTentative state.
 		for _, r := range n.mu.endpoints {
-			if addr := r.ep.ID().LocalAddress; r.getKind() == permanentTentative && header.IsV6UnicastAddress(addr) {
+			if addr := r.address(); r.getKind() == permanentTentative && header.IsV6UnicastAddress(addr) {
 				n.mu.ndp.stopDuplicateAddressDetection(addr)
 			}
 		}
@@ -300,7 +304,7 @@ func (n *NIC) enable() *tcpip.Error {
 	// Addresses may have aleady completed DAD but in the time since the NIC was
 	// last enabled, other devices may have acquired the same addresses.
 	for _, r := range n.mu.endpoints {
-		addr := r.ep.ID().LocalAddress
+		addr := r.address()
 		if k := r.getKind(); (k != permanent && k != permanentTentative) || !header.IsV6UnicastAddress(addr) {
 			continue
 		}
@@ -362,6 +366,11 @@ func (n *NIC) remove() *tcpip.Error {
 		}
 	}
 
+	// Release any resources the network endpoint may hold.
+	for _, ep := range n.networkEndpoints {
+		ep.Close()
+	}
+
 	// Detach from link endpoint, so no packet comes in.
 	n.linkEP.Attach(nil)
 
@@ -510,7 +519,7 @@ func (n *NIC) primaryIPv6EndpointRLocked(remoteAddr tcpip.Address) *referencedNe
 			continue
 		}
 
-		addr := r.ep.ID().LocalAddress
+		addr := r.address()
 		scope, err := header.ScopeForIPv6Address(addr)
 		if err != nil {
 			// Should never happen as we got r from the primary IPv6 endpoint list and
@@ -539,10 +548,10 @@ func (n *NIC) primaryIPv6EndpointRLocked(remoteAddr tcpip.Address) *referencedNe
 		sb := cs[j]
 
 		// Prefer same address as per RFC 6724 section 5 rule 1.
-		if sa.ref.ep.ID().LocalAddress == remoteAddr {
+		if sa.ref.address() == remoteAddr {
 			return true
 		}
-		if sb.ref.ep.ID().LocalAddress == remoteAddr {
+		if sb.ref.address() == remoteAddr {
 			return false
 		}
 
@@ -819,17 +828,11 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 		}
 	}
 
-	netProto, ok := n.stack.networkProtocols[protocolAddress.Protocol]
+	ep, ok := n.networkEndpoints[protocolAddress.Protocol]
 	if !ok {
 		return nil, tcpip.ErrUnknownProtocol
 	}
 
-	// Create the new network endpoint.
-	ep, err := netProto.NewEndpoint(n.id, protocolAddress.AddressWithPrefix, n.stack, n, n.linkEP, n.stack)
-	if err != nil {
-		return nil, err
-	}
-
 	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
 
 	// If the address is an IPv6 address and it is a permanent address,
@@ -842,6 +845,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 
 	ref := &referencedNetworkEndpoint{
 		refs:       1,
+		addr:       protocolAddress.AddressWithPrefix,
 		ep:         ep,
 		nic:        n,
 		protocol:   protocolAddress.Protocol,
@@ -898,7 +902,7 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 	defer n.mu.RUnlock()
 
 	addrs := make([]tcpip.ProtocolAddress, 0, len(n.mu.endpoints))
-	for nid, ref := range n.mu.endpoints {
+	for _, ref := range n.mu.endpoints {
 		// Don't include tentative, expired or temporary endpoints to
 		// avoid confusion and prevent the caller from using those.
 		switch ref.getKind() {
@@ -907,11 +911,8 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 		}
 
 		addrs = append(addrs, tcpip.ProtocolAddress{
-			Protocol: ref.protocol,
-			AddressWithPrefix: tcpip.AddressWithPrefix{
-				Address:   nid.LocalAddress,
-				PrefixLen: ref.ep.PrefixLen(),
-			},
+			Protocol:          ref.protocol,
+			AddressWithPrefix: ref.addrWithPrefix(),
 		})
 	}
 	return addrs
@@ -934,11 +935,8 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
 			}
 
 			addrs = append(addrs, tcpip.ProtocolAddress{
-				Protocol: proto,
-				AddressWithPrefix: tcpip.AddressWithPrefix{
-					Address:   ref.ep.ID().LocalAddress,
-					PrefixLen: ref.ep.PrefixLen(),
-				},
+				Protocol:          proto,
+				AddressWithPrefix: ref.addrWithPrefix(),
 			})
 		}
 	}
@@ -969,10 +967,7 @@ func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWit
 		}
 
 		if !ref.deprecated {
-			return tcpip.AddressWithPrefix{
-				Address:   ref.ep.ID().LocalAddress,
-				PrefixLen: ref.ep.PrefixLen(),
-			}
+			return ref.addrWithPrefix()
 		}
 
 		if deprecatedEndpoint == nil {
@@ -981,10 +976,7 @@ func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWit
 	}
 
 	if deprecatedEndpoint != nil {
-		return tcpip.AddressWithPrefix{
-			Address:   deprecatedEndpoint.ep.ID().LocalAddress,
-			PrefixLen: deprecatedEndpoint.ep.PrefixLen(),
-		}
+		return deprecatedEndpoint.addrWithPrefix()
 	}
 
 	return tcpip.AddressWithPrefix{}
@@ -1048,7 +1040,7 @@ func (n *NIC) insertPrimaryEndpointLocked(r *referencedNetworkEndpoint, peb Prim
 }
 
 func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
-	id := *r.ep.ID()
+	id := NetworkEndpointID{LocalAddress: r.address()}
 
 	// Nothing to do if the reference has already been replaced with a different
 	// one. This happens in the case where 1) this endpoint's ref count hit zero
@@ -1072,8 +1064,6 @@ func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
 			break
 		}
 	}
-
-	r.ep.Close()
 }
 
 func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) {
@@ -1718,6 +1708,7 @@ const (
 
 type referencedNetworkEndpoint struct {
 	ep       NetworkEndpoint
+	addr     tcpip.AddressWithPrefix
 	nic      *NIC
 	protocol tcpip.NetworkProtocolNumber
 
@@ -1743,11 +1734,12 @@ type referencedNetworkEndpoint struct {
 	deprecated bool
 }
 
+func (r *referencedNetworkEndpoint) address() tcpip.Address {
+	return r.addr.Address
+}
+
 func (r *referencedNetworkEndpoint) addrWithPrefix() tcpip.AddressWithPrefix {
-	return tcpip.AddressWithPrefix{
-		Address:   r.ep.ID().LocalAddress,
-		PrefixLen: r.ep.PrefixLen(),
-	}
+	return r.addr
 }
 
 func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index 0870c8d9c..d312a79eb 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -101,11 +101,9 @@ var _ NetworkEndpoint = (*testIPv6Endpoint)(nil)
 // We use this instead of ipv6.endpoint because the ipv6 package depends on
 // the stack package which this test lives in, causing a cyclic dependency.
 type testIPv6Endpoint struct {
-	nicID     tcpip.NICID
-	id        NetworkEndpointID
-	prefixLen int
-	linkEP    LinkEndpoint
-	protocol  *testIPv6Protocol
+	nicID    tcpip.NICID
+	linkEP   LinkEndpoint
+	protocol *testIPv6Protocol
 }
 
 // DefaultTTL implements NetworkEndpoint.DefaultTTL.
@@ -146,16 +144,6 @@ func (*testIPv6Endpoint) WriteHeaderIncludedPacket(*Route, *PacketBuffer) *tcpip
 	return tcpip.ErrNotSupported
 }
 
-// ID implements NetworkEndpoint.ID.
-func (e *testIPv6Endpoint) ID() *NetworkEndpointID {
-	return &e.id
-}
-
-// PrefixLen implements NetworkEndpoint.PrefixLen.
-func (e *testIPv6Endpoint) PrefixLen() int {
-	return e.prefixLen
-}
-
 // NICID implements NetworkEndpoint.NICID.
 func (e *testIPv6Endpoint) NICID() tcpip.NICID {
 	return e.nicID
@@ -204,14 +192,12 @@ func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 }
 
 // NewEndpoint implements NetworkProtocol.NewEndpoint.
-func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
+func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) NetworkEndpoint {
 	return &testIPv6Endpoint{
-		nicID:     nicID,
-		id:        NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
-		prefixLen: addrWithPrefix.PrefixLen,
-		linkEP:    linkEP,
-		protocol:  p,
-	}, nil
+		nicID:    nicID,
+		linkEP:   linkEP,
+		protocol: p,
+	}
 }
 
 // SetOption implements NetworkProtocol.SetOption.
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 4570e8969..aca2f77f8 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -262,12 +262,6 @@ type NetworkEndpoint interface {
 	// header to the given destination address. It takes ownership of pkt.
 	WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error
 
-	// ID returns the network protocol endpoint ID.
-	ID() *NetworkEndpointID
-
-	// PrefixLen returns the network endpoint's subnet prefix length in bits.
-	PrefixLen() int
-
 	// NICID returns the id of the NIC this endpoint belongs to.
 	NICID() tcpip.NICID
 
@@ -304,7 +298,7 @@ type NetworkProtocol interface {
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
 	// NewEndpoint creates a new endpoint of this protocol.
-	NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) (NetworkEndpoint, *tcpip.Error)
+	NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) NetworkEndpoint
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 5b19c5d59..9a1c8e409 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1321,7 +1321,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	if id != 0 && !needRoute {
 		if nic, ok := s.nics[id]; ok && nic.enabled() {
 			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
-				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
+				return makeRoute(netProto, ref.address(), remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
 			}
 		}
 	} else {
@@ -1334,10 +1334,10 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 					if len(remoteAddr) == 0 {
 						// If no remote address was provided, then the route
 						// provided will refer to the link local address.
-						remoteAddr = ref.ep.ID().LocalAddress
+						remoteAddr = ref.address()
 					}
 
-					r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback())
+					r := makeRoute(netProto, ref.address(), remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback())
 					r.directedBroadcast = route.Destination.IsBroadcast(remoteAddr)
 
 					if len(route.Gateway) > 0 {
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 0273b3c63..b5a603098 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -70,8 +70,6 @@ const (
 // protocol. They're all one byte fields to simplify parsing.
 type fakeNetworkEndpoint struct {
 	nicID      tcpip.NICID
-	id         stack.NetworkEndpointID
-	prefixLen  int
 	proto      *fakeNetworkProtocol
 	dispatcher stack.TransportDispatcher
 	ep         stack.LinkEndpoint
@@ -85,21 +83,13 @@ func (f *fakeNetworkEndpoint) NICID() tcpip.NICID {
 	return f.nicID
 }
 
-func (f *fakeNetworkEndpoint) PrefixLen() int {
-	return f.prefixLen
-}
-
 func (*fakeNetworkEndpoint) DefaultTTL() uint8 {
 	return 123
 }
 
-func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID {
-	return &f.id
-}
-
 func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 	// Increment the received packet count in the protocol descriptor.
-	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
+	f.proto.packetCount[int(r.LocalAddress[0])%len(f.proto.packetCount)]++
 
 	// Handle control packets.
 	if pkt.NetworkHeader().View()[protocolNumberOffset] == uint8(fakeControlProtocol) {
@@ -145,7 +135,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 	// endpoint.
 	hdr := pkt.NetworkHeader().Push(fakeNetHeaderLen)
 	hdr[dstAddrOffset] = r.RemoteAddress[0]
-	hdr[srcAddrOffset] = f.id.LocalAddress[0]
+	hdr[srcAddrOffset] = r.LocalAddress[0]
 	hdr[protocolNumberOffset] = byte(params.Protocol)
 
 	if r.Loop&stack.PacketLoop != 0 {
@@ -208,15 +198,13 @@ func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Addres
 	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
 }
 
-func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
+func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) stack.NetworkEndpoint {
 	return &fakeNetworkEndpoint{
 		nicID:      nicID,
-		id:         stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
-		prefixLen:  addrWithPrefix.PrefixLen,
 		proto:      f,
 		dispatcher: dispatcher,
 		ep:         ep,
-	}, nil
+	}
 }
 
 func (f *fakeNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 71776d6db..f87d99d5a 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -1469,13 +1469,10 @@ func TestTTL(t *testing.T) {
 				} else {
 					p = ipv6.NewProtocol()
 				}
-				ep, err := p.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil, stack.New(stack.Options{
+				ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{
 					NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
 					TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
 				}))
-				if err != nil {
-					t.Fatal(err)
-				}
 				wantTTL = ep.DefaultTTL()
 				ep.Close()
 			}
@@ -1505,13 +1502,10 @@ func TestSetTTL(t *testing.T) {
 					} else {
 						p = ipv6.NewProtocol()
 					}
-					ep, err := p.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil, stack.New(stack.Options{
+					ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{
 						NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
 						TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
 					}))
-					if err != nil {
-						t.Fatal(err)
-					}
 					ep.Close()
 
 					testWrite(c, flow, checker.TTL(wantTTL))
-- 
cgit v1.2.3


From 703b0d0b47641bcee80402eb7b6cf9b8c1f2cf70 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Sat, 15 Aug 2020 00:04:30 -0700
Subject: Don't support address ranges

Previously the netstack supported assignment of a range of addresses.
This feature is not used so remove it.

PiperOrigin-RevId: 326791119
---
 pkg/tcpip/stack/nic.go        |  63 ++------------
 pkg/tcpip/stack/stack.go      |  29 -------
 pkg/tcpip/stack/stack_test.go | 194 ------------------------------------------
 3 files changed, 8 insertions(+), 278 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 10d2b7964..8a9a085f0 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -51,13 +51,12 @@ type NIC struct {
 
 	mu struct {
 		sync.RWMutex
-		enabled       bool
-		spoofing      bool
-		promiscuous   bool
-		primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
-		endpoints     map[NetworkEndpointID]*referencedNetworkEndpoint
-		addressRanges []tcpip.Subnet
-		mcastJoins    map[NetworkEndpointID]uint32
+		enabled     bool
+		spoofing    bool
+		promiscuous bool
+		primary     map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
+		endpoints   map[NetworkEndpointID]*referencedNetworkEndpoint
+		mcastJoins  map[NetworkEndpointID]uint32
 		// packetEPs is protected by mu, but the contained PacketEndpoint
 		// values are not.
 		packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
@@ -670,25 +669,6 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	// A usable reference was not found, create a temporary one if requested by
 	// the caller or if the address is found in the NIC's subnets.
 	createTempEP := spoofingOrPromiscuous
-	if !createTempEP {
-		for _, sn := range n.mu.addressRanges {
-			// Skip the subnet address.
-			if address == sn.ID() {
-				continue
-			}
-			// For now just skip the broadcast address, until we support it.
-			// FIXME(b/137608825): Add support for sending/receiving directed
-			// (subnet) broadcast.
-			if address == sn.Broadcast() {
-				continue
-			}
-			if sn.Contains(address) {
-				createTempEP = true
-				break
-			}
-		}
-	}
-
 	n.mu.RUnlock()
 
 	if !createTempEP {
@@ -982,38 +962,11 @@ func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWit
 	return tcpip.AddressWithPrefix{}
 }
 
-// AddAddressRange adds a range of addresses to n, so that it starts accepting
-// packets targeted at the given addresses and network protocol. The range is
-// given by a subnet address, and all addresses contained in the subnet are
-// used except for the subnet address itself and the subnet's broadcast
-// address.
-func (n *NIC) AddAddressRange(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) {
-	n.mu.Lock()
-	n.mu.addressRanges = append(n.mu.addressRanges, subnet)
-	n.mu.Unlock()
-}
-
-// RemoveAddressRange removes the given address range from n.
-func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
-	n.mu.Lock()
-
-	// Use the same underlying array.
-	tmp := n.mu.addressRanges[:0]
-	for _, sub := range n.mu.addressRanges {
-		if sub != subnet {
-			tmp = append(tmp, sub)
-		}
-	}
-	n.mu.addressRanges = tmp
-
-	n.mu.Unlock()
-}
-
 // AddressRanges returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
-	sns := make([]tcpip.Subnet, 0, len(n.mu.addressRanges)+len(n.mu.endpoints))
+	sns := make([]tcpip.Subnet, 0, len(n.mu.endpoints))
 	for nid := range n.mu.endpoints {
 		sn, err := tcpip.NewSubnet(nid.LocalAddress, tcpip.AddressMask(strings.Repeat("\xff", len(nid.LocalAddress))))
 		if err != nil {
@@ -1023,7 +976,7 @@ func (n *NIC) AddressRanges() []tcpip.Subnet {
 		}
 		sns = append(sns, sn)
 	}
-	return append(sns, n.mu.addressRanges...)
+	return sns
 }
 
 // insertPrimaryEndpointLocked adds r to n's primary endpoint list as required
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 9a1c8e409..ae44cd5da 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1230,35 +1230,6 @@ func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tc
 	return nic.AddAddress(protocolAddress, peb)
 }
 
-// AddAddressRange adds a range of addresses to the specified NIC. The range is
-// given by a subnet address, and all addresses contained in the subnet are
-// used except for the subnet address itself and the subnet's broadcast
-// address.
-func (s *Stack) AddAddressRange(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	if nic, ok := s.nics[id]; ok {
-		nic.AddAddressRange(protocol, subnet)
-		return nil
-	}
-
-	return tcpip.ErrUnknownNICID
-}
-
-// RemoveAddressRange removes the range of addresses from the specified NIC.
-func (s *Stack) RemoveAddressRange(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	if nic, ok := s.nics[id]; ok {
-		nic.RemoveAddressRange(subnet)
-		return nil
-	}
-
-	return tcpip.ErrUnknownNICID
-}
-
 // RemoveAddress removes an existing network-layer address from the specified
 // NIC.
 func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index b5a603098..106645c50 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -23,7 +23,6 @@ import (
 	"math"
 	"net"
 	"sort"
-	"strings"
 	"testing"
 	"time"
 
@@ -1641,149 +1640,6 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) {
 	}
 }
 
-// Add a range of addresses, then check that a packet is delivered.
-func TestAddressRangeAcceptsMatchingPacket(t *testing.T) {
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
-
-	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(1, ep); err != nil {
-		t.Fatal("CreateNIC failed:", err)
-	}
-
-	{
-		subnet, err := tcpip.NewSubnet("\x00", "\x00")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
-	}
-
-	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
-
-	buf := buffer.NewView(30)
-
-	const localAddrByte byte = 0x01
-	buf[dstAddrOffset] = localAddrByte
-	subnet, err := tcpip.NewSubnet(tcpip.Address("\x00"), tcpip.AddressMask("\xF0"))
-	if err != nil {
-		t.Fatal("NewSubnet failed:", err)
-	}
-	if err := s.AddAddressRange(1, fakeNetNumber, subnet); err != nil {
-		t.Fatal("AddAddressRange failed:", err)
-	}
-
-	testRecv(t, fakeNet, localAddrByte, ep, buf)
-}
-
-func testNicForAddressRange(t *testing.T, nicID tcpip.NICID, s *stack.Stack, subnet tcpip.Subnet, rangeExists bool) {
-	t.Helper()
-
-	// Loop over all addresses and check them.
-	numOfAddresses := 1 << uint(8-subnet.Prefix())
-	if numOfAddresses < 1 || numOfAddresses > 255 {
-		t.Fatalf("got numOfAddresses = %d, want = [1 .. 255] (subnet=%s)", numOfAddresses, subnet)
-	}
-
-	addrBytes := []byte(subnet.ID())
-	for i := 0; i < numOfAddresses; i++ {
-		addr := tcpip.Address(addrBytes)
-		wantNicID := nicID
-		// The subnet and broadcast addresses are skipped.
-		if !rangeExists || addr == subnet.ID() || addr == subnet.Broadcast() {
-			wantNicID = 0
-		}
-		if gotNicID := s.CheckLocalAddress(0, fakeNetNumber, addr); gotNicID != wantNicID {
-			t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = %d", fakeNetNumber, addr, gotNicID, wantNicID)
-		}
-		addrBytes[0]++
-	}
-
-	// Trying the next address should always fail since it is outside the range.
-	if gotNicID := s.CheckLocalAddress(0, fakeNetNumber, tcpip.Address(addrBytes)); gotNicID != 0 {
-		t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = 0", fakeNetNumber, tcpip.Address(addrBytes), gotNicID)
-	}
-}
-
-// Set a range of addresses, then remove it again, and check at each step that
-// CheckLocalAddress returns the correct NIC for each address or zero if not
-// existent.
-func TestCheckLocalAddressForSubnet(t *testing.T) {
-	const nicID tcpip.NICID = 1
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
-
-	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(nicID, ep); err != nil {
-		t.Fatal("CreateNIC failed:", err)
-	}
-
-	{
-		subnet, err := tcpip.NewSubnet("\x00", "\x00")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: nicID}})
-	}
-
-	subnet, err := tcpip.NewSubnet(tcpip.Address("\xa0"), tcpip.AddressMask("\xf0"))
-	if err != nil {
-		t.Fatal("NewSubnet failed:", err)
-	}
-
-	testNicForAddressRange(t, nicID, s, subnet, false /* rangeExists */)
-
-	if err := s.AddAddressRange(nicID, fakeNetNumber, subnet); err != nil {
-		t.Fatal("AddAddressRange failed:", err)
-	}
-
-	testNicForAddressRange(t, nicID, s, subnet, true /* rangeExists */)
-
-	if err := s.RemoveAddressRange(nicID, subnet); err != nil {
-		t.Fatal("RemoveAddressRange failed:", err)
-	}
-
-	testNicForAddressRange(t, nicID, s, subnet, false /* rangeExists */)
-}
-
-// Set a range of addresses, then send a packet to a destination outside the
-// range and then check it doesn't get delivered.
-func TestAddressRangeRejectsNonmatchingPacket(t *testing.T) {
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
-
-	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(1, ep); err != nil {
-		t.Fatal("CreateNIC failed:", err)
-	}
-
-	{
-		subnet, err := tcpip.NewSubnet("\x00", "\x00")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
-	}
-
-	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
-
-	buf := buffer.NewView(30)
-
-	const localAddrByte byte = 0x01
-	buf[dstAddrOffset] = localAddrByte
-	subnet, err := tcpip.NewSubnet(tcpip.Address("\x10"), tcpip.AddressMask("\xF0"))
-	if err != nil {
-		t.Fatal("NewSubnet failed:", err)
-	}
-	if err := s.AddAddressRange(1, fakeNetNumber, subnet); err != nil {
-		t.Fatal("AddAddressRange failed:", err)
-	}
-	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
-}
-
 func TestNetworkOptions(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
@@ -1827,56 +1683,6 @@ func TestNetworkOptions(t *testing.T) {
 	}
 }
 
-func stackContainsAddressRange(s *stack.Stack, id tcpip.NICID, addrRange tcpip.Subnet) bool {
-	ranges, ok := s.NICAddressRanges()[id]
-	if !ok {
-		return false
-	}
-	for _, r := range ranges {
-		if r == addrRange {
-			return true
-		}
-	}
-	return false
-}
-
-func TestAddresRangeAddRemove(t *testing.T) {
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
-	ep := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(1, ep); err != nil {
-		t.Fatal("CreateNIC failed:", err)
-	}
-
-	addr := tcpip.Address("\x01\x01\x01\x01")
-	mask := tcpip.AddressMask(strings.Repeat("\xff", len(addr)))
-	addrRange, err := tcpip.NewSubnet(addr, mask)
-	if err != nil {
-		t.Fatal("NewSubnet failed:", err)
-	}
-
-	if got, want := stackContainsAddressRange(s, 1, addrRange), false; got != want {
-		t.Fatalf("got stackContainsAddressRange(...) = %t, want = %t", got, want)
-	}
-
-	if err := s.AddAddressRange(1, fakeNetNumber, addrRange); err != nil {
-		t.Fatal("AddAddressRange failed:", err)
-	}
-
-	if got, want := stackContainsAddressRange(s, 1, addrRange), true; got != want {
-		t.Fatalf("got stackContainsAddressRange(...) = %t, want = %t", got, want)
-	}
-
-	if err := s.RemoveAddressRange(1, addrRange); err != nil {
-		t.Fatal("RemoveAddressRange failed:", err)
-	}
-
-	if got, want := stackContainsAddressRange(s, 1, addrRange), false; got != want {
-		t.Fatalf("got stackContainsAddressRange(...) = %t, want = %t", got, want)
-	}
-}
-
 func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) {
 	for _, addrLen := range []int{4, 16} {
 		t.Run(fmt.Sprintf("addrLen=%d", addrLen), func(t *testing.T) {
-- 
cgit v1.2.3


From a22ac024239d2d757c80deca06ad86341691b04c Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Mon, 17 Aug 2020 10:03:38 -0700
Subject: [vfs] Return EIO when opening /dev/tty.

This is in compliance with VFS1. See pkg/sentry/fs/dev/tty.go in the struct
ttyInodeOperations.

Fixes the failure of python runtime test_ioctl.
Updates #3515

PiperOrigin-RevId: 327042758
---
 pkg/sentry/devices/ttydev/BUILD     |  2 +-
 pkg/sentry/devices/ttydev/ttydev.go | 46 +++----------------------------------
 2 files changed, 4 insertions(+), 44 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/devices/ttydev/BUILD b/pkg/sentry/devices/ttydev/BUILD
index 12e49b58a..b4b6ca38a 100644
--- a/pkg/sentry/devices/ttydev/BUILD
+++ b/pkg/sentry/devices/ttydev/BUILD
@@ -11,6 +11,6 @@ go_library(
         "//pkg/context",
         "//pkg/sentry/fsimpl/devtmpfs",
         "//pkg/sentry/vfs",
-        "//pkg/usermem",
+        "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go
index fd4b79c46..664e54498 100644
--- a/pkg/sentry/devices/ttydev/ttydev.go
+++ b/pkg/sentry/devices/ttydev/ttydev.go
@@ -12,10 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package ttydev implements devices for /dev/tty and (eventually)
-// /dev/console.
-//
-// TODO(b/159623826): Support /dev/console.
+// Package ttydev implements an unopenable vfs.Device for /dev/tty.
 package ttydev
 
 import (
@@ -23,7 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 const (
@@ -37,44 +34,7 @@ type ttyDevice struct{}
 
 // Open implements vfs.Device.Open.
 func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &ttyFD{}
-	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
-		UseDentryMetadata: true,
-	}); err != nil {
-		return nil, err
-	}
-	return &fd.vfsfd, nil
-}
-
-// ttyFD implements vfs.FileDescriptionImpl for /dev/tty.
-type ttyFD struct {
-	vfsfd vfs.FileDescription
-	vfs.FileDescriptionDefaultImpl
-	vfs.DentryMetadataFileDescriptionImpl
-	vfs.NoLockFD
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *ttyFD) Release(context.Context) {}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	return 0, nil
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	return 0, nil
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	return src.NumBytes(), nil
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	return src.NumBytes(), nil
+	return nil, syserror.EIO
 }
 
 // Register registers all devices implemented by this package in vfsObj.
-- 
cgit v1.2.3


From 80681bdb9541f31eafbe6e4593f76d98ff6e641a Mon Sep 17 00:00:00 2001
From: Arthur Sfez <asfez@google.com>
Date: Mon, 17 Aug 2020 10:04:03 -0700
Subject: Add a unit test for out of order IP reassembly

PiperOrigin-RevId: 327042869
---
 pkg/tcpip/network/ipv4/ipv4_test.go | 22 +++++++++++++++++++++
 pkg/tcpip/network/ipv6/ipv6_test.go | 38 +++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

(limited to 'pkg')

diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index afd3ac06d..197e3bc51 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -627,6 +627,28 @@ func TestReceiveFragments(t *testing.T) {
 			},
 			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
 		},
+		{
+			name: "Two fragments out of order",
+			fragments: []fragmentData{
+				{
+					srcAddr:        addr1,
+					dstAddr:        addr2,
+					id:             1,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1Addr1ToAddr2[64:],
+				},
+				{
+					srcAddr:        addr1,
+					dstAddr:        addr2,
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1Addr1ToAddr2[:64],
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
+		},
 		{
 			name: "Two fragments with last fragment size not a multiple of fragment block size",
 			fragments: []fragmentData{
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 081afb051..0a183bfde 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -827,6 +827,44 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 			},
 			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
 		},
+		{
+			name: "Two fragments out of order",
+			fragments: []fragmentData{
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1Addr1ToAddr2[64:],
+						},
+					),
+				},
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1Addr1ToAddr2[:64],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
+		},
 		{
 			name: "Two fragments with last fragment size not a multiple of fragment block size",
 			fragments: []fragmentData{
-- 
cgit v1.2.3


From d1179ffa205b6ea60b450fd1c7e91230564719c8 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 17 Aug 2020 11:40:08 -0700
Subject: Remove weak references from unix sockets.

The abstract socket namespace no longer holds any references on sockets.
Instead, TryIncRef() is used when a socket is being retrieved in
BoundEndpoint(). Abstract sockets are now responsible for removing themselves
from the namespace they are in, when they are destroyed.

Updates #1486.

PiperOrigin-RevId: 327064173
---
 pkg/refs_vfs2/BUILD                            |  6 +-
 pkg/refs_vfs2/refs.go                          |  4 +-
 pkg/sentry/kernel/BUILD                        |  1 +
 pkg/sentry/kernel/abstract_socket_namespace.go | 77 +++++++++++++++++---------
 pkg/sentry/socket/unix/BUILD                   | 14 +++++
 pkg/sentry/socket/unix/unix.go                 | 22 ++++++--
 pkg/sentry/socket/unix/unix_vfs2.go            |  6 +-
 7 files changed, 91 insertions(+), 39 deletions(-)

(limited to 'pkg')

diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD
index 7f180c7bd..7b3e10683 100644
--- a/pkg/refs_vfs2/BUILD
+++ b/pkg/refs_vfs2/BUILD
@@ -19,10 +19,8 @@ go_template(
 )
 
 go_library(
-    name = "refs",
-    srcs = [
-        "refs.go",
-    ],
+    name = "refs_vfs2",
+    srcs = ["refs.go"],
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/context"],
 )
diff --git a/pkg/refs_vfs2/refs.go b/pkg/refs_vfs2/refs.go
index ee01b17b0..99a074e96 100644
--- a/pkg/refs_vfs2/refs.go
+++ b/pkg/refs_vfs2/refs.go
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package refs defines an interface for a reference-counted object.
-package refs
+// Package refs_vfs2 defines an interface for a reference-counted object.
+package refs_vfs2
 
 import (
 	"gvisor.dev/gvisor/pkg/context"
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index f6886a758..5416a310d 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -163,6 +163,7 @@ go_library(
         "//pkg/log",
         "//pkg/metric",
         "//pkg/refs",
+        "//pkg/refs_vfs2",
         "//pkg/safemem",
         "//pkg/secio",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 52ed5cea2..1b9721534 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -15,29 +15,21 @@
 package kernel
 
 import (
+	"fmt"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/refs_vfs2"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // +stateify savable
 type abstractEndpoint struct {
-	ep   transport.BoundEndpoint
-	wr   *refs.WeakRef
-	name string
-	ns   *AbstractSocketNamespace
-}
-
-// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (e *abstractEndpoint) WeakRefGone(context.Context) {
-	e.ns.mu.Lock()
-	if e.ns.endpoints[e.name].ep == e.ep {
-		delete(e.ns.endpoints, e.name)
-	}
-	e.ns.mu.Unlock()
+	ep     transport.BoundEndpoint
+	socket refs_vfs2.RefCounter
+	name   string
+	ns     *AbstractSocketNamespace
 }
 
 // AbstractSocketNamespace is used to implement the Linux abstract socket functionality.
@@ -46,7 +38,11 @@ func (e *abstractEndpoint) WeakRefGone(context.Context) {
 type AbstractSocketNamespace struct {
 	mu sync.Mutex `state:"nosave"`
 
-	// Keeps mapping from name to endpoint.
+	// Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold
+	// any references on any sockets that it contains; when retrieving a socket,
+	// TryIncRef() must be called in case the socket is concurrently being
+	// destroyed. It is the responsibility of the socket to remove itself from the
+	// abstract socket namespace when it is destroyed.
 	endpoints map[string]abstractEndpoint
 }
 
@@ -58,15 +54,15 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
 }
 
 // A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on
-// its backing object.
+// its backing socket.
 type boundEndpoint struct {
 	transport.BoundEndpoint
-	rc refs.RefCounter
+	socket refs_vfs2.RefCounter
 }
 
 // Release implements transport.BoundEndpoint.Release.
 func (e *boundEndpoint) Release(ctx context.Context) {
-	e.rc.DecRef(ctx)
+	e.socket.DecRef(ctx)
 	e.BoundEndpoint.Release(ctx)
 }
 
@@ -81,32 +77,59 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
 		return nil
 	}
 
-	rc := ep.wr.Get()
-	if rc == nil {
-		delete(a.endpoints, name)
+	if !ep.socket.TryIncRef() {
+		// The socket has reached zero references and is being destroyed.
 		return nil
 	}
 
-	return &boundEndpoint{ep.ep, rc}
+	return &boundEndpoint{ep.ep, ep.socket}
 }
 
 // Bind binds the given socket.
 //
-// When the last reference managed by rc is dropped, ep may be removed from the
+// When the last reference managed by socket is dropped, ep may be removed from the
 // namespace.
-func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, rc refs.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 
+	// Check if there is already a socket (which has not yet been destroyed) bound at name.
 	if ep, ok := a.endpoints[name]; ok {
-		if rc := ep.wr.Get(); rc != nil {
-			rc.DecRef(ctx)
+		if ep.socket.TryIncRef() {
+			ep.socket.DecRef(ctx)
 			return syscall.EADDRINUSE
 		}
 	}
 
 	ae := abstractEndpoint{ep: ep, name: name, ns: a}
-	ae.wr = refs.NewWeakRef(rc, &ae)
+	ae.socket = socket
 	a.endpoints[name] = ae
 	return nil
 }
+
+// Remove removes the specified socket at name from the abstract socket
+// namespace, if it has not yet been replaced.
+func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	ep, ok := a.endpoints[name]
+	if !ok {
+		// We never delete a map entry apart from a socket's destructor (although the
+		// map entry may be overwritten). Therefore, a socket should exist, even if it
+		// may not be the one we expect.
+		panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name))
+	}
+
+	// A Bind() operation may race with callers of Remove(), e.g. in the
+	// following case:
+	//   socket1 reaches zero references and begins destruction
+	//   a.Bind("foo", ep, socket2) replaces socket1 with socket2
+	//   socket1's destructor calls a.Remove("foo", socket1)
+	//
+	// Therefore, we need to check that the socket at name is what we expect
+	// before modifying the map.
+	if ep.socket == socket {
+		delete(a.endpoints, name)
+	}
+}
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 061a689a9..cb953e4dc 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -1,12 +1,25 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
+go_template_instance(
+    name = "socket_refs",
+    out = "socket_refs.go",
+    package = "unix",
+    prefix = "socketOpsCommon",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "socketOpsCommon",
+    },
+)
+
 go_library(
     name = "unix",
     srcs = [
         "device.go",
         "io.go",
+        "socket_refs.go",
         "unix.go",
         "unix_vfs2.go",
     ],
@@ -15,6 +28,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 2b8454edb..b7e8e4325 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -24,7 +24,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -80,7 +79,7 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
 			stype: stype,
 		},
 	}
-	s.EnableLeakCheck("unix.SocketOperations")
+	s.EnableLeakCheck()
 
 	return fs.NewFile(ctx, d, flags, &s)
 }
@@ -89,17 +88,26 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
 //
 // +stateify savable
 type socketOpsCommon struct {
-	refs.AtomicRefCount
+	socketOpsCommonRefs
 	socket.SendReceiveTimeout
 
 	ep    transport.Endpoint
 	stype linux.SockType
+
+	// abstractName and abstractNamespace indicate the name and namespace of the
+	// socket if it is bound to an abstract socket namespace. Once the socket is
+	// bound, they cannot be modified.
+	abstractName      string
+	abstractNamespace *kernel.AbstractSocketNamespace
 }
 
 // DecRef implements RefCounter.DecRef.
 func (s *socketOpsCommon) DecRef(ctx context.Context) {
-	s.DecRefWithDestructor(ctx, func(context.Context) {
+	s.socketOpsCommonRefs.DecRef(func() {
 		s.ep.Close(ctx)
+		if s.abstractNamespace != nil {
+			s.abstractNamespace.Remove(s.abstractName, s)
+		}
 	})
 }
 
@@ -284,10 +292,14 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 			if t.IsNetworkNamespaced() {
 				return syserr.ErrInvalidEndpointState
 			}
-			if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil {
+			asn := t.AbstractSockets()
+			name := p[1:]
+			if err := asn.Bind(t, name, bep, s); err != nil {
 				// syserr.ErrPortInUse corresponds to EADDRINUSE.
 				return syserr.ErrPortInUse
 			}
+			s.abstractName = name
+			s.abstractNamespace = asn
 		} else {
 			// The parent and name.
 			var d *fs.Dirent
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index dfa25241a..d066ef8ab 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -183,10 +183,14 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 			if t.IsNetworkNamespaced() {
 				return syserr.ErrInvalidEndpointState
 			}
-			if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil {
+			asn := t.AbstractSockets()
+			name := p[1:]
+			if err := asn.Bind(t, name, bep, s); err != nil {
 				// syserr.ErrPortInUse corresponds to EADDRINUSE.
 				return syserr.ErrPortInUse
 			}
+			s.abstractName = name
+			s.abstractNamespace = asn
 		} else {
 			path := fspath.Parse(p)
 			root := t.FSContext().RootDirectoryVFS2()
-- 
cgit v1.2.3


From e1635261defd19195506eab8050455e992739026 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 17 Aug 2020 12:27:59 -0700
Subject: Remove address range functions

Should have been removed in cl/326791119
https://github.com/google/gvisor/commit/9a7b5830aa063895f67ca0fdf653a46906374613

PiperOrigin-RevId: 327074156
---
 pkg/tcpip/stack/nic.go   | 18 ------------------
 pkg/tcpip/stack/stack.go | 13 -------------
 2 files changed, 31 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 8a9a085f0..728292782 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -19,7 +19,6 @@ import (
 	"math/rand"
 	"reflect"
 	"sort"
-	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/sync"
@@ -962,23 +961,6 @@ func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWit
 	return tcpip.AddressWithPrefix{}
 }
 
-// AddressRanges returns the Subnets associated with this NIC.
-func (n *NIC) AddressRanges() []tcpip.Subnet {
-	n.mu.RLock()
-	defer n.mu.RUnlock()
-	sns := make([]tcpip.Subnet, 0, len(n.mu.endpoints))
-	for nid := range n.mu.endpoints {
-		sn, err := tcpip.NewSubnet(nid.LocalAddress, tcpip.AddressMask(strings.Repeat("\xff", len(nid.LocalAddress))))
-		if err != nil {
-			// This should never happen as the mask has been carefully crafted to
-			// match the address.
-			panic("Invalid endpoint subnet: " + err.Error())
-		}
-		sns = append(sns, sn)
-	}
-	return sns
-}
-
 // insertPrimaryEndpointLocked adds r to n's primary endpoint list as required
 // by peb.
 //
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index ae44cd5da..a3f87c8af 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1102,19 +1102,6 @@ func (s *Stack) removeNICLocked(id tcpip.NICID) *tcpip.Error {
 	return nic.remove()
 }
 
-// NICAddressRanges returns a map of NICIDs to their associated subnets.
-func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	nics := map[tcpip.NICID][]tcpip.Subnet{}
-
-	for id, nic := range s.nics {
-		nics[id] = append(nics[id], nic.AddressRanges()...)
-	}
-	return nics
-}
-
 // NICInfo captures the name and addresses assigned to a NIC.
 type NICInfo struct {
 	Name              string
-- 
cgit v1.2.3


From 6c870ab053ff47a8fb13d3c0bf064d90592aa1f7 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Mon, 17 Aug 2020 13:24:09 -0700
Subject: [vfs] Do O_DIRECTORY check after resolving symlinks.

Fixes python runtime test test_glob.
Updates #3515

We were checking is the to-be-opened dentry is a dir or not before resolving
symlinks. We should check that after resolving symlinks.
This was preventing us from opening a symlink which pointed to a directory
with O_DIRECTORY.

Also added this check in tmpfs and removed a duplicate check.

PiperOrigin-RevId: 327085895
---
 pkg/sentry/fsimpl/gofer/filesystem.go | 6 +++---
 pkg/sentry/fsimpl/tmpfs/filesystem.go | 5 ++---
 test/syscalls/linux/open.cc           | 8 ++++++++
 3 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 610a7ed78..a3903db33 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -886,9 +886,6 @@ afterTrailingSymlink:
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
-	if !child.isDir() && rp.MustBeDir() {
-		return nil, syserror.ENOTDIR
-	}
 	// Open existing child or follow symlink.
 	if child.isSymlink() && rp.ShouldFollowSymlink() {
 		target, err := child.readlink(ctx, rp.Mount())
@@ -901,6 +898,9 @@ afterTrailingSymlink:
 		start = parent
 		goto afterTrailingSymlink
 	}
+	if rp.MustBeDir() && !child.isDir() {
+		return nil, syserror.ENOTDIR
+	}
 	return child.openLocked(ctx, rp, &opts)
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index a4864df53..cb8b2d944 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -389,9 +389,8 @@ afterTrailingSymlink:
 		start = &parentDir.dentry
 		goto afterTrailingSymlink
 	}
-	// Open existing file.
-	if mustCreate {
-		return nil, syserror.EEXIST
+	if rp.MustBeDir() && !child.inode.isDir() {
+		return nil, syserror.ENOTDIR
 	}
 	return child.open(ctx, rp, &opts, false)
 }
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index 8f0c9cb49..77f390f3c 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -27,6 +27,7 @@
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -408,6 +409,13 @@ TEST_F(OpenTest, FileNotDirectory) {
               SyscallFailsWithErrno(ENOTDIR));
 }
 
+TEST_F(OpenTest, SymlinkDirectory) {
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  std::string link = NewTempAbsPath();
+  ASSERT_THAT(symlink(dir.path().c_str(), link.c_str()), SyscallSucceeds());
+  ASSERT_NO_ERRNO(Open(link, O_RDONLY | O_DIRECTORY));
+}
+
 TEST_F(OpenTest, Null) {
   char c = '\0';
   ASSERT_THAT(open(&c, O_RDONLY), SyscallFailsWithErrno(ENOENT));
-- 
cgit v1.2.3


From 2529efaf0abe5cbfc58184697bf33017bc2a4f06 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 17 Aug 2020 15:53:58 -0700
Subject: Add Verify in merkle tree library

Verify checks input data against the merkle tree, and compares the root
hash with expectation.

PiperOrigin-RevId: 327116711
---
 pkg/merkletree/BUILD              |   1 +
 pkg/merkletree/merkletree.go      | 259 ++++++++++++++++++++++++++-----
 pkg/merkletree/merkletree_test.go | 319 ++++++++++++++++++++++++++++++++------
 3 files changed, 495 insertions(+), 84 deletions(-)

(limited to 'pkg')

diff --git a/pkg/merkletree/BUILD b/pkg/merkletree/BUILD
index 5b0e4143a..a8fcb2e19 100644
--- a/pkg/merkletree/BUILD
+++ b/pkg/merkletree/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_library(
     name = "merkletree",
     srcs = ["merkletree.go"],
+    visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/usermem"],
 )
 
diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
index 906f67943..955c9c473 100644
--- a/pkg/merkletree/merkletree.go
+++ b/pkg/merkletree/merkletree.go
@@ -16,7 +16,9 @@
 package merkletree
 
 import (
+	"bytes"
 	"crypto/sha256"
+	"fmt"
 	"io"
 
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -27,50 +29,78 @@ const (
 	sha256DigestSize = 32
 )
 
-// Size defines the scale of a Merkle tree.
-type Size struct {
+// Layout defines the scale of a Merkle tree.
+type Layout struct {
 	// blockSize is the size of a data block to be hashed.
 	blockSize int64
 	// digestSize is the size of a generated hash.
 	digestSize int64
-	// hashesPerBlock is the number of hashes in a block. For example, if
-	// blockSize is 4096 bytes, and digestSize is 32 bytes, there will be 128
-	// hashesPerBlock. Therefore 128 hashes in a lower level will be put into a
-	// block and generate a single hash in an upper level.
-	hashesPerBlock int64
-	// levelStart is the start block index of each level. The number of levels in
-	// the tree is the length of the slice. The leafs (level 0) are hashes of
-	// blocks in the input data. The levels above are hashes of lower level
-	// hashes.  The highest level is the root hash.
-	levelStart []int64
+	// levelOffset contains the offset of the begnning of each level in
+	// bytes. The number of levels in the tree is the length of the slice.
+	// The leaf nodes (level 0) contain hashes of blocks of the input data.
+	// Each level N contains hashes of the blocks in level N-1. The highest
+	// level is the root hash.
+	levelOffset []int64
 }
 
-// MakeSize initializes and returns a new Size object describing the structure
-// of a tree. dataSize specifies the number of the file system size in bytes.
-func MakeSize(dataSize int64) Size {
-	size := Size{
+// InitLayout initializes and returns a new Layout object describing the structure
+// of a tree. dataSize specifies the size of input data in bytes.
+func InitLayout(dataSize int64) Layout {
+	layout := Layout{
 		blockSize: usermem.PageSize,
 		// TODO(b/156980949): Allow config other hash methods (SHA384/SHA512).
-		digestSize:     sha256DigestSize,
-		hashesPerBlock: usermem.PageSize / sha256DigestSize,
+		digestSize: sha256DigestSize,
 	}
-	numBlocks := (dataSize + size.blockSize - 1) / size.blockSize
-	level := int64(0)
+	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
+	level := 0
 	offset := int64(0)
 
-	// Calcuate the number of levels in the Merkle tree and the beginning offset
-	// of each level. Level 0 is the level directly above the data blocks, while
-	// level NumLevels - 1 is the root.
+	// Calculate the number of levels in the Merkle tree and the beginning
+	// offset of each level. Level 0 consists of the leaf nodes that
+	// contain the hashes of the data blocks, while level numLevels - 1 is
+	// the root.
 	for numBlocks > 1 {
-		size.levelStart = append(size.levelStart, offset)
+		layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
 		// Round numBlocks up to fill up a block.
-		numBlocks += (size.hashesPerBlock - numBlocks%size.hashesPerBlock) % size.hashesPerBlock
-		offset += numBlocks / size.hashesPerBlock
-		numBlocks = numBlocks / size.hashesPerBlock
+		numBlocks += (layout.hashesPerBlock() - numBlocks%layout.hashesPerBlock()) % layout.hashesPerBlock()
+		offset += numBlocks / layout.hashesPerBlock()
+		numBlocks = numBlocks / layout.hashesPerBlock()
 		level++
 	}
-	size.levelStart = append(size.levelStart, offset)
-	return size
+	layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
+	return layout
+}
+
+// hashesPerBlock() returns the number of digests in each block.  For example,
+// if blockSize is 4096 bytes, and digestSize is 32 bytes, there will be 128
+// hashesPerBlock. Therefore 128 hashes in one level will be combined in one
+// hash in the level above.
+func (layout Layout) hashesPerBlock() int64 {
+	return layout.blockSize / layout.digestSize
+}
+
+// numLevels returns the total number of levels in the Merkle tree.
+func (layout Layout) numLevels() int {
+	return len(layout.levelOffset)
+}
+
+// rootLevel returns the level of the root hash.
+func (layout Layout) rootLevel() int {
+	return layout.numLevels() - 1
+}
+
+// digestOffset finds the offset of a digest from the beginning of the tree.
+// The target digest is at level of the tree, with index from the beginning of
+// the current level.
+func (layout Layout) digestOffset(level int, index int64) int64 {
+	return layout.levelOffset[level] + index*layout.digestSize
+}
+
+// blockOffset finds the offset of a block from the beginning of the tree.  The
+// target block is at level of the tree, with index from the beginning of the
+// current level.
+func (layout Layout) blockOffset(level int, index int64) int64 {
+	return layout.levelOffset[level] + index*layout.blockSize
 }
 
 // Generate constructs a Merkle tree for the contents of data. The output is
@@ -78,21 +108,21 @@ func MakeSize(dataSize int64) Size {
 // it has been written. That is, treeWriter and treeReader should point to the
 // same underlying data but have separate cursors.
 func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter io.Writer) ([]byte, error) {
-	size := MakeSize(dataSize)
+	layout := InitLayout(dataSize)
 
-	numBlocks := (dataSize + size.blockSize - 1) / size.blockSize
+	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
 
 	var root []byte
-	for level := 0; level < len(size.levelStart); level++ {
+	for level := 0; level < layout.numLevels(); level++ {
 		for i := int64(0); i < numBlocks; i++ {
-			buf := make([]byte, size.blockSize)
+			buf := make([]byte, layout.blockSize)
 			var (
 				n   int
 				err error
 			)
 			if level == 0 {
-				// Read data block from the target file since level 0 is directly above
-				// the raw data block.
+				// Read data block from the target file since level 0 includes hashes
+				// of blocks in the input data.
 				n, err = data.Read(buf)
 			} else {
 				// Read data block from the tree file since levels higher than 0 are
@@ -112,7 +142,7 @@ func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter i
 			// Hash the bytes in buf.
 			digest := sha256.Sum256(buf)
 
-			if level == len(size.levelStart)-1 {
+			if level == layout.rootLevel() {
 				root = digest[:]
 			}
 
@@ -121,15 +151,164 @@ func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter i
 				return nil, err
 			}
 		}
-		// If the genereated digests do not round up to a block, zero-padding the
+		// If the generated digests do not round up to a block, zero-padding the
 		// remaining of the last block. But no need to do so for root.
-		if level != len(size.levelStart)-1 && numBlocks%size.hashesPerBlock != 0 {
-			zeroBuf := make([]byte, size.blockSize-(numBlocks%size.hashesPerBlock)*size.digestSize)
+		if level != layout.rootLevel() && numBlocks%layout.hashesPerBlock() != 0 {
+			zeroBuf := make([]byte, layout.blockSize-(numBlocks%layout.hashesPerBlock())*layout.digestSize)
 			if _, err := treeWriter.Write(zeroBuf[:]); err != nil {
 				return nil, err
 			}
 		}
-		numBlocks = (numBlocks + size.hashesPerBlock - 1) / size.hashesPerBlock
+		numBlocks = (numBlocks + layout.hashesPerBlock() - 1) / layout.hashesPerBlock()
 	}
 	return root, nil
 }
+
+// Verify verifies the content read from data with offset. The content is
+// verified against tree. If content spans across multiple blocks, each block is
+// verified. Verification fails if the hash of the data does not match the tree
+// at any level, or if the final root hash does not match expectedRoot.
+// Once the data is verified, it will be written using w.
+// Verify will modify the cursor for data, but always restores it to its
+// original position upon exit. The cursor for tree is modified and not
+// restored.
+func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset int64, readSize int64, expectedRoot []byte) error {
+	if readSize <= 0 {
+		return fmt.Errorf("Unexpected read size: %d", readSize)
+	}
+	layout := InitLayout(int64(dataSize))
+
+	// Calculate the index of blocks that includes the target range in input
+	// data.
+	firstDataBlock := readOffset / layout.blockSize
+	lastDataBlock := (readOffset + readSize - 1) / layout.blockSize
+
+	// Store the current offset, so we can set it back once verification
+	// finishes.
+	origOffset, err := data.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return fmt.Errorf("Find current data offset failed: %v", err)
+	}
+	defer data.Seek(origOffset, io.SeekStart)
+
+	// Move to the first block that contains target data.
+	if _, err := data.Seek(firstDataBlock*layout.blockSize, io.SeekStart); err != nil {
+		return fmt.Errorf("Seek to datablock start failed: %v", err)
+	}
+
+	buf := make([]byte, layout.blockSize)
+	var readErr error
+	bytesRead := 0
+	for i := firstDataBlock; i <= lastDataBlock; i++ {
+		// Read a block that includes all or part of target range in
+		// input data.
+		bytesRead, readErr = data.Read(buf)
+		// If at the end of input data and all previous blocks are
+		// verified, return the verified input data and EOF.
+		if readErr == io.EOF && bytesRead == 0 {
+			break
+		}
+		if readErr != nil && readErr != io.EOF {
+			return fmt.Errorf("Read from data failed: %v", err)
+		}
+		// If this is the end of file, zero the remaining bytes in buf,
+		// otherwise they are still from the previous block.
+		// TODO(b/162908070): Investigate possible issues with zero
+		// padding the data.
+		if bytesRead < len(buf) {
+			for j := bytesRead; j < len(buf); j++ {
+				buf[j] = 0
+			}
+		}
+		if err := verifyBlock(tree, layout, buf, i, expectedRoot); err != nil {
+			return err
+		}
+		// startOff is the beginning of the read range within the
+		// current data block. Note that for all blocks other than the
+		// first, startOff should be 0.
+		startOff := int64(0)
+		if i == firstDataBlock {
+			startOff = readOffset % layout.blockSize
+		}
+		// endOff is the end of the read range within the current data
+		// block. Note that for all blocks other than the last,  endOff
+		// should be the block size.
+		endOff := layout.blockSize
+		if i == lastDataBlock {
+			endOff = (readOffset+readSize-1)%layout.blockSize + 1
+		}
+		// If the provided size exceeds the end of input data, we should
+		// only copy the parts in buf that's part of input data.
+		if startOff > int64(bytesRead) {
+			startOff = int64(bytesRead)
+		}
+		if endOff > int64(bytesRead) {
+			endOff = int64(bytesRead)
+		}
+		w.Write(buf[startOff:endOff])
+
+	}
+	return readErr
+}
+
+// verifyBlock verifies a block against tree. index is the number of block in
+// original data. The block is verified through each level of the tree. It
+// fails if the calculated hash from block is different from any level of
+// hashes stored in tree. And the final root hash is compared with
+// expectedRoot.  verifyBlock modifies the cursor for tree. Users needs to
+// maintain the cursor if intended.
+func verifyBlock(tree io.ReadSeeker, layout Layout, dataBlock []byte, blockIndex int64, expectedRoot []byte) error {
+	if len(dataBlock) != int(layout.blockSize) {
+		return fmt.Errorf("incorrect block size")
+	}
+
+	expectedDigest := make([]byte, layout.digestSize)
+	treeBlock := make([]byte, layout.blockSize)
+	var digest []byte
+	for level := 0; level < layout.numLevels(); level++ {
+		// Calculate hash.
+		if level == 0 {
+			digestArray := sha256.Sum256(dataBlock)
+			digest = digestArray[:]
+		} else {
+			// Read a block in previous level that contains the
+			// hash we just generated, and generate a next level
+			// hash from it.
+			if _, err := tree.Seek(layout.blockOffset(level-1, blockIndex), io.SeekStart); err != nil {
+				return err
+			}
+			if _, err := tree.Read(treeBlock); err != nil {
+				return err
+			}
+			digestArray := sha256.Sum256(treeBlock)
+			digest = digestArray[:]
+		}
+
+		// Move to stored hash for the current block, read the digest
+		// and store in expectedDigest.
+		if _, err := tree.Seek(layout.digestOffset(level, blockIndex), io.SeekStart); err != nil {
+			return err
+		}
+		if _, err := tree.Read(expectedDigest); err != nil {
+			return err
+		}
+
+		if !bytes.Equal(digest, expectedDigest) {
+			return fmt.Errorf("Verification failed")
+		}
+
+		// If this is the root layer, no need to generate next level
+		// hash.
+		if level == layout.rootLevel() {
+			break
+		}
+		blockIndex = blockIndex / layout.hashesPerBlock()
+	}
+
+	// Verification for the tree succeeded. Now compare the root hash in the
+	// tree with expectedRoot.
+	if !bytes.Equal(digest[:], expectedRoot) {
+		return fmt.Errorf("Verification failed")
+	}
+	return nil
+}
diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go
index 7344db0b6..911f61df9 100644
--- a/pkg/merkletree/merkletree_test.go
+++ b/pkg/merkletree/merkletree_test.go
@@ -17,45 +17,48 @@ package merkletree
 import (
 	"bytes"
 	"fmt"
+	"io"
+	"math/rand"
 	"testing"
+	"time"
 
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func TestSize(t *testing.T) {
+func TestLayout(t *testing.T) {
 	testCases := []struct {
-		dataSize           int64
-		expectedLevelStart []int64
+		dataSize            int64
+		expectedLevelOffset []int64
 	}{
 		{
-			dataSize:           100,
-			expectedLevelStart: []int64{0},
+			dataSize:            100,
+			expectedLevelOffset: []int64{0},
 		},
 		{
-			dataSize:           1000000,
-			expectedLevelStart: []int64{0, 2, 3},
+			dataSize:            1000000,
+			expectedLevelOffset: []int64{0, 2 * usermem.PageSize, 3 * usermem.PageSize},
 		},
 		{
-			dataSize:           4096 * int64(usermem.PageSize),
-			expectedLevelStart: []int64{0, 32, 33},
+			dataSize:            4096 * int64(usermem.PageSize),
+			expectedLevelOffset: []int64{0, 32 * usermem.PageSize, 33 * usermem.PageSize},
 		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) {
-			s := MakeSize(tc.dataSize)
-			if s.blockSize != int64(usermem.PageSize) {
-				t.Errorf("got blockSize %d, want %d", s.blockSize, usermem.PageSize)
+			p := InitLayout(tc.dataSize)
+			if p.blockSize != int64(usermem.PageSize) {
+				t.Errorf("got blockSize %d, want %d", p.blockSize, usermem.PageSize)
 			}
-			if s.digestSize != sha256DigestSize {
-				t.Errorf("got digestSize %d, want %d", s.digestSize, sha256DigestSize)
+			if p.digestSize != sha256DigestSize {
+				t.Errorf("got digestSize %d, want %d", p.digestSize, sha256DigestSize)
 			}
-			if len(s.levelStart) != len(tc.expectedLevelStart) {
-				t.Errorf("got levels %d, want %d", len(s.levelStart), len(tc.expectedLevelStart))
+			if p.numLevels() != len(tc.expectedLevelOffset) {
+				t.Errorf("got levels %d, want %d", p.numLevels(), len(tc.expectedLevelOffset))
 			}
-			for i := 0; i < len(s.levelStart) && i < len(tc.expectedLevelStart); i++ {
-				if s.levelStart[i] != tc.expectedLevelStart[i] {
-					t.Errorf("got levelStart[%d] %d, want %d", i, s.levelStart[i], tc.expectedLevelStart[i])
+			for i := 0; i < p.numLevels() && i < len(tc.expectedLevelOffset); i++ {
+				if p.levelOffset[i] != tc.expectedLevelOffset[i] {
+					t.Errorf("got levelStart[%d] %d, want %d", i, p.levelOffset[i], tc.expectedLevelOffset[i])
 				}
 			}
 		})
@@ -66,57 +69,285 @@ func TestGenerate(t *testing.T) {
 	// The input data has size dataSize. It starts with the data in startWith,
 	// and all other bytes are zeroes.
 	testCases := []struct {
-		dataSize     int
-		startWith    []byte
+		data         []byte
 		expectedRoot []byte
 	}{
 		{
-			dataSize:     usermem.PageSize,
-			startWith:    nil,
+			data:         bytes.Repeat([]byte{0}, usermem.PageSize),
 			expectedRoot: []byte{173, 127, 172, 178, 88, 111, 198, 233, 102, 192, 4, 215, 209, 209, 107, 2, 79, 88, 5, 255, 124, 180, 124, 122, 133, 218, 189, 139, 72, 137, 44, 167},
 		},
 		{
-			dataSize:     128*usermem.PageSize + 1,
-			startWith:    nil,
+			data:         bytes.Repeat([]byte{0}, 128*usermem.PageSize+1),
 			expectedRoot: []byte{62, 93, 40, 92, 161, 241, 30, 223, 202, 99, 39, 2, 132, 113, 240, 139, 117, 99, 79, 243, 54, 18, 100, 184, 141, 121, 238, 46, 149, 202, 203, 132},
 		},
 		{
-			dataSize:     1,
-			startWith:    []byte{'a'},
+			data:         []byte{'a'},
 			expectedRoot: []byte{52, 75, 204, 142, 172, 129, 37, 14, 145, 137, 103, 203, 11, 162, 209, 205, 30, 169, 213, 72, 20, 28, 243, 24, 242, 2, 92, 43, 169, 59, 110, 210},
 		},
 		{
-			dataSize:     1,
-			startWith:    []byte{'1'},
-			expectedRoot: []byte{74, 35, 103, 179, 176, 149, 254, 112, 42, 65, 104, 66, 119, 56, 133, 124, 228, 15, 65, 161, 150, 0, 117, 174, 242, 34, 115, 115, 218, 37, 3, 105},
+			data:         bytes.Repeat([]byte{'a'}, usermem.PageSize),
+			expectedRoot: []byte{201, 62, 238, 45, 13, 176, 47, 16, 172, 199, 70, 13, 149, 118, 225, 34, 220, 248, 205, 83, 196, 191, 141, 252, 174, 27, 62, 116, 235, 207, 255, 90},
 		},
 	}
 
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) {
-			var (
-				data bytes.Buffer
-				tree bytes.Buffer
-			)
+		t.Run(fmt.Sprintf("%d:%v", len(tc.data), tc.data[0]), func(t *testing.T) {
+			var tree bytes.Buffer
 
-			startSize := len(tc.startWith)
-			_, err := data.Write(tc.startWith)
+			root, err := Generate(bytes.NewBuffer(tc.data), int64(len(tc.data)), &tree, &tree)
 			if err != nil {
-				t.Fatalf("Failed to write to data: %v", err)
+				t.Fatalf("Generate failed: %v", err)
 			}
-			_, err = data.Write(make([]byte, tc.dataSize-startSize))
-			if err != nil {
-				t.Fatalf("Failed to write to data: %v", err)
+
+			if !bytes.Equal(root, tc.expectedRoot) {
+				t.Errorf("Unexpected root")
 			}
+		})
+	}
+}
+
+// bytesReadWriter is used to read from/write to/seek in a byte array. Unlike
+// bytes.Buffer, it keeps the whole buffer during read so that it can be reused.
+type bytesReadWriter struct {
+	// bytes contains the underlying byte array.
+	bytes []byte
+	// readPos is the currently location for Read. Write always appends to
+	// the end of the array.
+	readPos int
+}
+
+func (brw *bytesReadWriter) Write(p []byte) (int, error) {
+	brw.bytes = append(brw.bytes, p...)
+	return len(p), nil
+}
+
+func (brw *bytesReadWriter) Read(p []byte) (int, error) {
+	if brw.readPos >= len(brw.bytes) {
+		return 0, io.EOF
+	}
+	bytesRead := copy(p, brw.bytes[brw.readPos:])
+	brw.readPos += bytesRead
+	if bytesRead < len(p) {
+		return bytesRead, io.EOF
+	}
+	return bytesRead, nil
+}
+
+func (brw *bytesReadWriter) Seek(offset int64, whence int) (int64, error) {
+	off := offset
+	if whence == io.SeekCurrent {
+		off += int64(brw.readPos)
+	}
+	if whence == io.SeekEnd {
+		off += int64(len(brw.bytes))
+	}
+	if off < 0 {
+		panic("seek with negative offset")
+	}
+	if off >= int64(len(brw.bytes)) {
+		return 0, io.EOF
+	}
+	brw.readPos = int(off)
+	return off, nil
+}
+
+func TestVerify(t *testing.T) {
+	// The input data has size dataSize. The portion to be verified ranges from
+	// verifyStart with verifySize. A bit is flipped in outOfRangeByteIndex to
+	// confirm that modifications outside the verification range does not cause
+	// issue. And a bit is flipped in modifyByte to confirm that
+	// modifications in the verification range is caught during verification.
+	testCases := []struct {
+		dataSize    int64
+		verifyStart int64
+		verifySize  int64
+		// A byte in input data is modified during the test. If the
+		// modified byte falls in verification range, Verify should
+		// fail, otherwise Verify should still succeed.
+		modifyByte    int64
+		shouldSucceed bool
+	}{
+		// Verify range start outside the data range should fail.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   usermem.PageSize,
+			verifySize:    1,
+			modifyByte:    0,
+			shouldSucceed: false,
+		},
+		// Verifying range is valid if it starts inside data and ends
+		// outside data range, in that case start to the end of data is
+		// verified.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   0,
+			verifySize:    2 * usermem.PageSize,
+			modifyByte:    0,
+			shouldSucceed: false,
+		},
+		// Invalid verify range (negative size) should fail.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   1,
+			verifySize:    -1,
+			modifyByte:    0,
+			shouldSucceed: false,
+		},
+		// Invalid verify range (0 size) should fail.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   0,
+			verifySize:    0,
+			modifyByte:    0,
+			shouldSucceed: false,
+		},
+		// The test cases below use a block-aligned verify range.
+		// Modifying a byte in the verified range should cause verify
+		// to fail.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4 * usermem.PageSize,
+			verifySize:    usermem.PageSize,
+			modifyByte:    4 * usermem.PageSize,
+			shouldSucceed: false,
+		},
+		// Modifying a byte before the verified range should not cause
+		// verify to fail.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4 * usermem.PageSize,
+			verifySize:    usermem.PageSize,
+			modifyByte:    4*usermem.PageSize - 1,
+			shouldSucceed: true,
+		},
+		// Modifying a byte after the verified range should not cause
+		// verify to fail.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4 * usermem.PageSize,
+			verifySize:    usermem.PageSize,
+			modifyByte:    5 * usermem.PageSize,
+			shouldSucceed: true,
+		},
+		// The tests below use a non-block-aligned verify range.
+		// Modifying a byte at strat of verify range should cause
+		// verify to fail.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4*usermem.PageSize + 123,
+			verifySize:    2 * usermem.PageSize,
+			modifyByte:    4*usermem.PageSize + 123,
+			shouldSucceed: false,
+		},
+		// Modifying a byte at the end of verify range should cause
+		// verify to fail.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4*usermem.PageSize + 123,
+			verifySize:    2 * usermem.PageSize,
+			modifyByte:    6*usermem.PageSize + 123,
+			shouldSucceed: false,
+		},
+		// Modifying a byte in the middle verified block should cause
+		// verify to fail.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4*usermem.PageSize + 123,
+			verifySize:    2 * usermem.PageSize,
+			modifyByte:    5*usermem.PageSize + 123,
+			shouldSucceed: false,
+		},
+		// Modifying a byte in the first block in the verified range
+		// should cause verify to fail, even the modified bit itself is
+		// out of verify range.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4*usermem.PageSize + 123,
+			verifySize:    2 * usermem.PageSize,
+			modifyByte:    4*usermem.PageSize + 122,
+			shouldSucceed: false,
+		},
+		// Modifying a byte in the last block in the verified range
+		// should cause verify to fail, even the modified bit itself is
+		// out of verify range.
+		{
+			dataSize:      8 * usermem.PageSize,
+			verifyStart:   4*usermem.PageSize + 123,
+			verifySize:    2 * usermem.PageSize,
+			modifyByte:    6*usermem.PageSize + 124,
+			shouldSucceed: false,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("%d", tc.modifyByte), func(t *testing.T) {
+			data := make([]byte, tc.dataSize)
+			// Generate random bytes in data.
+			rand.Read(data)
+			var tree bytesReadWriter
 
-			root, err := Generate(&data, int64(tc.dataSize), &tree, &tree)
+			root, err := Generate(bytes.NewBuffer(data), int64(tc.dataSize), &tree, &tree)
 			if err != nil {
 				t.Fatalf("Generate failed: %v", err)
 			}
 
-			if !bytes.Equal(root, tc.expectedRoot) {
-				t.Errorf("Unexpected root")
+			// Flip a bit in data and checks Verify results.
+			var buf bytes.Buffer
+			data[tc.modifyByte] ^= 1
+			if tc.shouldSucceed {
+				if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err != nil && err != io.EOF {
+					t.Errorf("Verification failed when expected to succeed: %v", err)
+				}
+				if int64(buf.Len()) != tc.verifySize || !bytes.Equal(data[tc.verifyStart:tc.verifyStart+tc.verifySize], buf.Bytes()) {
+					t.Errorf("Incorrect output from Verify")
+				}
+			} else {
+				if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err == nil {
+					t.Errorf("Verification succeeded when expected to fail")
+				}
 			}
 		})
 	}
 }
+
+func TestVerifyRandom(t *testing.T) {
+	rand.Seed(time.Now().UnixNano())
+	// Use a random dataSize.  Minimum size 2 so that we can pick a random
+	// portion from it.
+	dataSize := rand.Int63n(200*usermem.PageSize) + 2
+	data := make([]byte, dataSize)
+	// Generate random bytes in data.
+	rand.Read(data)
+	var tree bytesReadWriter
+
+	root, err := Generate(bytes.NewBuffer(data), int64(dataSize), &tree, &tree)
+	if err != nil {
+		t.Fatalf("Generate failed: %v", err)
+	}
+
+	// Pick a random portion of data.
+	start := rand.Int63n(dataSize - 1)
+	size := rand.Int63n(dataSize) + 1
+
+	var buf bytes.Buffer
+	// Checks that the random portion of data from the original data is
+	// verified successfully.
+	if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err != nil && err != io.EOF {
+		t.Errorf("Verification failed for correct data: %v", err)
+	}
+	if size > dataSize-start {
+		size = dataSize - start
+	}
+	if int64(buf.Len()) != size || !bytes.Equal(data[start:start+size], buf.Bytes()) {
+		t.Errorf("Incorrect output from Verify")
+	}
+
+	buf.Reset()
+	// Flip a random bit in randPortion, and check that verification fails.
+	randBytePos := rand.Int63n(size)
+	data[start+randBytePos] ^= 1
+
+	if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err == nil {
+		t.Errorf("Verification succeeded for modified data")
+	}
+}
-- 
cgit v1.2.3


From 327a3014c4548b03b26ef669f8fe811fc28228bf Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 17 Aug 2020 16:28:19 -0700
Subject: Stop masking the IO error in handleIOError.

PiperOrigin-RevId: 327123331
---
 pkg/sentry/syscalls/linux/error.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 64de56ac5..46060f6f5 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -36,8 +36,8 @@ var (
 // errors, we may consume the error and return only the partial read/write.
 //
 // op and f are used only for panics.
-func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error {
-	known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, ioerr, intr error, op string, f *vfs.FileDescription) error {
+	known, err := handleIOErrorImpl(t, partialResult, ioerr, intr, op)
 	if err != nil {
 		return err
 	}
@@ -46,7 +46,7 @@ func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op s
 		fs := f.Mount().Filesystem().VirtualFilesystem()
 		root := vfs.RootFromContext(t)
 		name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry())
-		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name)
+		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, ioerr, ioerr, op, name)
 		partialResultOnce.Do(partialResultMetric.Increment)
 	}
 	return nil
@@ -56,15 +56,15 @@ func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op s
 // errors, we may consume the error and return only the partial read/write.
 //
 // op and f are used only for panics.
-func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
-	known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+func handleIOError(t *kernel.Task, partialResult bool, ioerr, intr error, op string, f *fs.File) error {
+	known, err := handleIOErrorImpl(t, partialResult, ioerr, intr, op)
 	if err != nil {
 		return err
 	}
 	if !known {
 		// An unknown error is encountered with a partial read/write.
 		name, _ := f.Dirent.FullName(nil /* ignore chroot */)
-		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, ioerr, ioerr, op, name, f.FileOperations)
 		partialResultOnce.Do(partialResultMetric.Increment)
 	}
 	return nil
-- 
cgit v1.2.3


From 4d571b4bf21147c132ea827fd19a0462a004688d Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 17 Aug 2020 16:29:10 -0700
Subject: Add a skeleton for verity file system

PiperOrigin-RevId: 327123477
---
 pkg/sentry/fsimpl/verity/BUILD         |  23 +++
 pkg/sentry/fsimpl/verity/filesystem.go | 333 +++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/verity/verity.go     | 355 +++++++++++++++++++++++++++++++++
 3 files changed, 711 insertions(+)
 create mode 100644 pkg/sentry/fsimpl/verity/BUILD
 create mode 100644 pkg/sentry/fsimpl/verity/filesystem.go
 create mode 100644 pkg/sentry/fsimpl/verity/verity.go

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
new file mode 100644
index 000000000..28d2a4bcb
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "verity",
+    srcs = [
+        "filesystem.go",
+        "verity.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
new file mode 100644
index 000000000..78c6074bd
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -0,0 +1,333 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All files should be read-only.
+	return nil
+}
+
+var dentrySlicePool = sync.Pool{
+	New: func() interface{} {
+		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+		return &ds
+	},
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil {
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	// Allow dentries to be GC'd.
+	for i := range *ds {
+		(*ds)[i] = nil
+	}
+	*ds = (*ds)[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil {
+		return
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock()
+		for _, d := range **ds {
+			d.checkDropLocked(ctx)
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+	if *ds == nil {
+		fs.renameMu.Unlock()
+		return
+	}
+	for _, d := range **ds {
+		d.checkDropLocked(ctx)
+	}
+	fs.renameMu.Unlock()
+	putDentrySlice(*ds)
+}
+
+// resolveLocked resolves rp to an existing file.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	// TODO(b/159261227): Implement resolveLocked.
+	return nil, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	// TODO(b/159261227): Implement walkParentDirLocked.
+	return nil, nil
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	// Verity file system is read-only.
+	if ats&vfs.MayWrite != 0 {
+		return syserror.EROFS
+	}
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.checkPermissions(creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	//TODO(b/159261227): Implement OpenAt.
+	return nil, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	//TODO(b/162787271): Provide integrity check for ReadlinkAt.
+	return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	})
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	var stat linux.Statx
+	stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	}, &opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	// TODO(b/159261227): Implement StatFSAt.
+	return linux.Statfs{}, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return nil, err
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	lowerVD := d.lowerVD
+	return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, size)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	lowerVD := d.lowerVD
+	return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &opts)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	mnt := vd.Mount()
+	d := vd.Dentry().Impl().(*dentry)
+	for {
+		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+			return vfs.PrependPathAtVFSRootError{}
+		}
+		if &d.vfsd == mnt.Root() {
+			return nil
+		}
+		if d.parent == nil {
+			return vfs.PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
new file mode 100644
index 000000000..cb29d33a5
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -0,0 +1,355 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package verity provides a filesystem implementation that is a wrapper of
+// another file system.
+// The verity file system provides integrity check for the underlying file
+// system by providing verification for path traversals and each read.
+// The verity file system is read-only, except for one case: when
+// allowRuntimeEnable is true, additional Merkle files can be generated using
+// the FS_IOC_ENABLE_VERITY ioctl.
+package verity
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "verity"
+
+// testOnlyDebugging allows verity file system to return error instead of
+// crashing the application when a malicious action is detected. This should
+// only be set for tests.
+var testOnlyDebugging bool
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// creds is a copy of the filesystem's creator's credentials, which are
+	// used for accesses to the underlying file system. creds is immutable.
+	creds *auth.Credentials
+
+	// allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY
+	// to build Merkle trees in the verity file system is allowed. If this
+	// is false, no new Merkle trees can be built, and only the files that
+	// had Merkle trees before startup (e.g. from a host filesystem mounted
+	// with gofer fs) can be verified.
+	allowRuntimeEnable bool
+
+	// lowerMount is the underlying file system mount.
+	lowerMount *vfs.Mount
+
+	// rootDentry is the mount root Dentry for this file system, which
+	// stores the root hash of the whole file system in bytes.
+	rootDentry *dentry
+
+	// renameMu synchronizes renaming with non-renaming operations in order
+	// to ensure consistent lock ordering between dentry.dirMu in different
+	// dentries.
+	renameMu sync.RWMutex
+}
+
+// InternalFilesystemOptions may be passed as
+// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+type InternalFilesystemOptions struct {
+	// RootMerkleFileName is the name of the verity root Merkle tree file.
+	RootMerkleFileName string
+
+	// LowerName is the name of the filesystem wrapped by verity fs.
+	LowerName string
+
+	// RootHash is the root hash of the overall verity file system.
+	RootHash []byte
+
+	// AllowRuntimeEnable specifies whether the verity file system allows
+	// enabling verification for files (i.e. building Merkle trees) during
+	// runtime.
+	AllowRuntimeEnable bool
+
+	// LowerGetFSOptions is the file system option for the lower layer file
+	// system wrapped by verity file system.
+	LowerGetFSOptions vfs.GetFilesystemOptions
+
+	// TestOnlyDebugging allows verity file system to return error instead
+	// of crashing the application when a malicious action is detected. This
+	// should only be set for tests.
+	TestOnlyDebugging bool
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	//TODO(b/159261227): Implement GetFilesystem.
+	return nil, nil, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.lowerMount.DecRef(ctx)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// mode, uid and gid are the file mode, owner, and group of the file in
+	// the underlying file system.
+	mode uint32
+	uid  uint32
+	gid  uint32
+
+	// parent is the dentry corresponding to this dentry's parent directory.
+	// name is this dentry's name in parent. If this dentry is a filesystem
+	// root, parent is nil and name is the empty string. parent and name are
+	// protected by fs.renameMu.
+	parent *dentry
+	name   string
+
+	// If this dentry represents a directory, children maps the names of
+	// children for which dentries have been instantiated to those dentries,
+	// and dirents (if not nil) is a cache of dirents as returned by
+	// directoryFDs representing this directory. children is protected by
+	// dirMu.
+	dirMu    sync.Mutex
+	children map[string]*dentry
+
+	// lowerVD is the VirtualDentry in the underlying file system.
+	lowerVD vfs.VirtualDentry
+
+	// lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree
+	// in the underlying file system.
+	lowerMerkleVD vfs.VirtualDentry
+
+	// rootHash is the rootHash for the current file or directory.
+	rootHash []byte
+}
+
+// newDentry creates a new dentry representing the given verity file. The
+// dentry initially has no references; it is the caller's responsibility to set
+// the dentry's reference count and/or call dentry.destroy() as appropriate.
+// The dentry is initially invalid in that it contains no underlying dentry;
+// the caller is responsible for setting them.
+func (fs *filesystem) newDentry() *dentry {
+	d := &dentry{
+		fs: fs,
+	}
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked(ctx)
+		d.fs.renameMu.Unlock()
+	} else if refs < 0 {
+		panic("verity.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or it
+// becomes deleted.
+func (d *dentry) checkDropLocked(ctx context.Context) {
+	// Dentries with a positive reference count must be retained. Dentries
+	// with a negative reference count have already been destroyed.
+	if atomic.LoadInt64(&d.refs) != 0 {
+		return
+	}
+	// Refs is still zero; destroy it.
+	d.destroyLocked(ctx)
+	return
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+func (d *dentry) destroyLocked(ctx context.Context) {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("verity.dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("verity.dentry.destroyLocked() called with references on the dentry")
+	}
+
+	if d.lowerVD.Ok() {
+		d.lowerVD.DecRef(ctx)
+	}
+
+	if d.lowerMerkleVD.Ok() {
+		d.lowerMerkleVD.DecRef(ctx)
+	}
+
+	if d.parent != nil {
+		d.parent.dirMu.Lock()
+		if !d.vfsd.IsDead() {
+			delete(d.parent.children, d.name)
+		}
+		d.parent.dirMu.Unlock()
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.checkDropLocked(ctx)
+		} else if refs < 0 {
+			panic("verity.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+	//TODO(b/159261227): Implement InotifyWithParent.
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	//TODO(b/159261227): Implement Watches.
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches(context.Context) {
+	//TODO(b/159261227): Implement OnZeroWatches.
+}
+
+func (d *dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) isDir() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	})
+}
+
+// FileDescription implements vfs.FileDescriptionImpl for verity fds.
+// FileDescription is a wrapper of the underlying lowerFD, with support to build
+// Merkle trees through the Linux fs-verity API to verify contents read from
+// lowerFD.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	// d is the corresponding dentry to the fileDescription.
+	d *dentry
+
+	// isDir specifies whehter the fileDescription points to a directory.
+	isDir bool
+
+	// lowerFD is the FileDescription corresponding to the file in the
+	// underlying file system.
+	lowerFD *vfs.FileDescription
+
+	// merkleReader is the read-only FileDescription corresponding to the
+	// Merkle tree file in the underlying file system.
+	merkleReader *vfs.FileDescription
+
+	// merkleWriter is the FileDescription corresponding to the Merkle tree
+	// file in the underlying file system for writing. This should only be
+	// used when allowRuntimeEnable is set to true.
+	merkleWriter *vfs.FileDescription
+
+	// parentMerkleWriter is the FileDescription of the Merkle tree for the
+	// directory that contains the current file/directory. This is only used
+	// if allowRuntimeEnable is set to true.
+	parentMerkleWriter *vfs.FileDescription
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+	fd.lowerFD.DecRef(ctx)
+	fd.merkleReader.DecRef(ctx)
+	if fd.merkleWriter != nil {
+		fd.merkleWriter.DecRef(ctx)
+	}
+	if fd.parentMerkleWriter != nil {
+		fd.parentMerkleWriter.DecRef(ctx)
+	}
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	// TODO(b/162788573): Add integrity check for metadata.
+	stat, err := fd.lowerFD.Stat(ctx, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	// Verity files are read-only.
+	return syserror.EPERM
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
-- 
cgit v1.2.3


From 8b5e9dbae85d0877a60112055aa304665d5e39fa Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 18 Aug 2020 10:20:17 -0700
Subject: [vfs2] Implement /proc/sys/net/ipv4/tcp_rmem and
 /proc/sys/net/ipv4/tcp_wmem.

Updates #1035

PiperOrigin-RevId: 327253907
---
 pkg/sentry/fsimpl/proc/BUILD        |   1 +
 pkg/sentry/fsimpl/proc/tasks_sys.go | 110 ++++++++++++++++++++++++++++++++++--
 2 files changed, 106 insertions(+), 5 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 6014138ff..14ecfd300 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -36,6 +36,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/header",
         "//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index b71778128..6435385ef 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -25,10 +25,18 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+type tcpMemDir int
+
+const (
+	tcpRMem tcpMemDir = iota
+	tcpWMem
+)
+
 // newSysDir returns the dentry corresponding to /proc/sys directory.
 func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
 	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
@@ -56,7 +64,9 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
 		contents = map[string]*kernfs.Dentry{
 			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
 				"tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
+				"tcp_rmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
 				"tcp_sack":     fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
+				"tcp_wmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
 
 				// The following files are simple stubs until they are implemented in
 				// netstack, most of these files are configuration related. We use the
@@ -181,10 +191,11 @@ func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// Tough luck.
 		val = "1\n"
 	}
-	buf.WriteString(val)
-	return nil
+	_, err := buf.WriteString(val)
+	return err
 }
 
+// Write implements vfs.WritableDynamicBytesSource.Write.
 func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
 	if offset != 0 {
 		// No need to handle partial writes thus far.
@@ -200,7 +211,7 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset
 	var v int32
 	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
 	if err != nil {
-		return n, err
+		return 0, err
 	}
 	if d.enabled == nil {
 		d.enabled = new(bool)
@@ -228,10 +239,11 @@ func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error
 		return err
 	}
 
-	buf.WriteString(fmt.Sprintf("%d\n", recovery))
-	return nil
+	_, err = buf.WriteString(fmt.Sprintf("%d\n", recovery))
+	return err
 }
 
+// Write implements vfs.WritableDynamicBytesSource.Write.
 func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
 	if offset != 0 {
 		// No need to handle partial writes thus far.
@@ -254,3 +266,91 @@ func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, off
 	}
 	return n, nil
 }
+
+// tcpMemData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem.
+//
+// +stateify savable
+type tcpMemData struct {
+	kernfs.DynamicBytesFile
+
+	dir   tcpMemDir
+	stack inet.Stack `state:"wait"`
+
+	// mu protects against concurrent reads/writes to FDs based on the dentry
+	// backing this byte source.
+	mu sync.Mutex `state:"nosave"`
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	size, err := d.readSizeLocked()
+	if err != nil {
+		return err
+	}
+	_, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max))
+	return err
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		// No need to handle partial writes thus far.
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	// Limit the amount of memory allocated.
+	src = src.TakeFirst(usermem.PageSize - 1)
+	size, err := d.readSizeLocked()
+	if err != nil {
+		return 0, err
+	}
+	buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)}
+	n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+	newSize := inet.TCPBufferSize{
+		Min:     int(buf[0]),
+		Default: int(buf[1]),
+		Max:     int(buf[2]),
+	}
+	if err := d.writeSizeLocked(newSize); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
+
+// Precondition: d.mu must be locked.
+func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) {
+	switch d.dir {
+	case tcpRMem:
+		return d.stack.TCPReceiveBufferSize()
+	case tcpWMem:
+		return d.stack.TCPSendBufferSize()
+	default:
+		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
+	}
+}
+
+// Precondition: d.mu must be locked.
+func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
+	switch d.dir {
+	case tcpRMem:
+		return d.stack.SetTCPReceiveBufferSize(size)
+	case tcpWMem:
+		return d.stack.SetTCPSendBufferSize(size)
+	default:
+		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
+	}
+}
-- 
cgit v1.2.3


From 596ba8e719eeb13bd8c8645ad3083a1ccc941d97 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 18 Aug 2020 12:30:10 -0700
Subject: Wait for all p9 handlers to complete before server shutdown.

... including those invoked via flipcall.

PiperOrigin-RevId: 327283194
---
 pkg/p9/server.go | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

(limited to 'pkg')

diff --git a/pkg/p9/server.go b/pkg/p9/server.go
index 60cf94fa1..b9f15e4ed 100644
--- a/pkg/p9/server.go
+++ b/pkg/p9/server.go
@@ -87,6 +87,9 @@ type connState struct {
 	// version 0 implies 9P2000.L.
 	version uint32
 
+	// pendingWg counts requests that are still being handled.
+	pendingWg sync.WaitGroup
+
 	// -- below relates to the legacy handler --
 
 	// recvOkay indicates that a receive may start.
@@ -479,7 +482,9 @@ func (cs *connState) lookupChannel(id uint32) *channel {
 
 // handle handles a single message.
 func (cs *connState) handle(m message) (r message) {
+	cs.pendingWg.Add(1)
 	defer func() {
+		cs.pendingWg.Done()
 		if r == nil {
 			// Don't allow a panic to propagate.
 			err := recover()
@@ -568,6 +573,11 @@ func (cs *connState) handleRequests() {
 }
 
 func (cs *connState) stop() {
+	// Wait for completion of all inflight requests. This is mostly so that if
+	// a request is stuck, the sandbox supervisor has the opportunity to kill
+	// us with SIGABRT to get a stack dump of the offending handler.
+	cs.pendingWg.Wait()
+
 	// Close all channels.
 	close(cs.recvOkay)
 	close(cs.recvDone)
@@ -606,11 +616,6 @@ func (cs *connState) stop() {
 
 // service services requests concurrently.
 func (cs *connState) service() error {
-	// Pending is the number of handlers that have finished receiving but
-	// not finished processing requests. These must be waiting on properly
-	// below. See the next comment for an explanation of the loop.
-	pending := 0
-
 	// Start the first request handler.
 	go cs.handleRequests() // S/R-SAFE: Irrelevant.
 	cs.recvOkay <- true
@@ -622,16 +627,9 @@ func (cs *connState) service() error {
 		select {
 		case err := <-cs.recvDone:
 			if err != nil {
-				// Wait for pending handlers.
-				for i := 0; i < pending; i++ {
-					<-cs.sendDone
-				}
-				return nil
+				return err
 			}
 
-			// This handler is now pending.
-			pending++
-
 			// Kick the next receiver, or start a new handler
 			// if no receiver is currently waiting.
 			select {
@@ -642,9 +640,6 @@ func (cs *connState) service() error {
 			}
 
 		case <-cs.sendDone:
-			// This handler is finished.
-			pending--
-
 			// Error sending a response? Nothing can be done.
 			//
 			// We don't terminate on a send error though, since
-- 
cgit v1.2.3


From 79d819a62c1db623ee8cb8f7df07c2d4702fd016 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 18 Aug 2020 14:34:15 -0700
Subject: Avoid holding locks when opening files in VFS2.

Fixes #3243, #3521

PiperOrigin-RevId: 327308890
---
 pkg/sentry/fsimpl/gofer/filesystem.go   | 32 ++++++++++++++++++-------
 pkg/sentry/fsimpl/kernfs/filesystem.go  | 28 +++++++++++++++++++---
 pkg/sentry/fsimpl/overlay/filesystem.go | 42 ++++++++++++++++++++++++---------
 pkg/sentry/fsimpl/tmpfs/filesystem.go   | 23 ++++++++++++++++--
 4 files changed, 100 insertions(+), 25 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index a3903db33..9a90351e5 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -834,7 +834,14 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	unlocked := false
+	unlock := func() {
+		if !unlocked {
+			fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+			unlocked = true
+		}
+	}
+	defer unlock()
 
 	start := rp.Start().Impl().(*dentry)
 	if !start.cachedMetadataAuthoritative() {
@@ -851,7 +858,10 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		return start.openLocked(ctx, rp, &opts)
+		start.IncRef()
+		defer start.DecRef(ctx)
+		unlock()
+		return start.open(ctx, rp, &opts)
 	}
 
 afterTrailingSymlink:
@@ -901,11 +911,15 @@ afterTrailingSymlink:
 	if rp.MustBeDir() && !child.isDir() {
 		return nil, syserror.ENOTDIR
 	}
-	return child.openLocked(ctx, rp, &opts)
+	child.IncRef()
+	defer child.DecRef(ctx)
+	unlock()
+	return child.open(ctx, rp, &opts)
 }
 
-// Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Preconditions: The caller must hold no locks (since opening pipes may block
+// indefinitely).
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
 	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
@@ -968,7 +982,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 			return nil, syserror.ENXIO
 		}
 		if d.fs.iopts.OpenSocketsByConnecting {
-			return d.connectSocketLocked(ctx, opts)
+			return d.openSocketByConnecting(ctx, opts)
 		}
 	case linux.S_IFIFO:
 		if d.isSynthetic() {
@@ -977,7 +991,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 	}
 
 	if vfd == nil {
-		if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil {
+		if vfd, err = d.openSpecialFile(ctx, mnt, opts); err != nil {
 			return nil, err
 		}
 	}
@@ -996,7 +1010,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 	return vfd, err
 }
 
-func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	if opts.Flags&linux.O_DIRECT != 0 {
 		return nil, syserror.EINVAL
 	}
@@ -1016,7 +1030,7 @@ func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions)
 	return fd, nil
 }
 
-func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
 	if opts.Flags&linux.O_DIRECT != 0 {
 		return nil, syserror.EINVAL
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index d7edb6342..3e5192edd 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -397,15 +397,21 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// Do not create new file.
 	if opts.Flags&linux.O_CREAT == 0 {
 		fs.mu.RLock()
-		defer fs.processDeferredDecRefs(ctx)
-		defer fs.mu.RUnlock()
 		vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
 		if err != nil {
+			fs.mu.RUnlock()
+			fs.processDeferredDecRefs(ctx)
 			return nil, err
 		}
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+			fs.mu.RUnlock()
+			fs.processDeferredDecRefs(ctx)
 			return nil, err
 		}
+		inode.IncRef()
+		defer inode.DecRef(ctx)
+		fs.mu.RUnlock()
+		fs.processDeferredDecRefs(ctx)
 		return inode.Open(ctx, rp, vfsd, opts)
 	}
 
@@ -414,7 +420,14 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*Dentry).inode
 	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	unlocked := false
+	unlock := func() {
+		if !unlocked {
+			fs.mu.Unlock()
+			unlocked = true
+		}
+	}
+	defer unlock()
 	if rp.Done() {
 		if rp.MustBeDir() {
 			return nil, syserror.EISDIR
@@ -425,6 +438,9 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
+		inode.IncRef()
+		defer inode.DecRef(ctx)
+		unlock()
 		return inode.Open(ctx, rp, vfsd, opts)
 	}
 afterTrailingSymlink:
@@ -466,6 +482,9 @@ afterTrailingSymlink:
 		}
 		child := childVFSD.Impl().(*Dentry)
 		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+		child.inode.IncRef()
+		defer child.inode.DecRef(ctx)
+		unlock()
 		return child.inode.Open(ctx, rp, childVFSD, opts)
 	}
 	if err != nil {
@@ -499,6 +518,9 @@ afterTrailingSymlink:
 	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
+	child.inode.IncRef()
+	defer child.inode.DecRef(ctx)
+	unlock()
 	return child.inode.Open(ctx, rp, &child.vfsd, opts)
 }
 
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 986b36ead..86d0164b4 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -717,17 +717,33 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	mayCreate := opts.Flags&linux.O_CREAT != 0
 	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+	mayWrite := vfs.AccessTypesForOpenFlags(&opts).MayWrite()
 
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	unlocked := false
+	unlock := func() {
+		if !unlocked {
+			fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+			unlocked = true
+		}
+	}
+	defer unlock()
 
 	start := rp.Start().Impl().(*dentry)
 	if rp.Done() {
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		return start.openLocked(ctx, rp, &opts)
+		if mayWrite {
+			if err := start.copyUpLocked(ctx); err != nil {
+				return nil, err
+			}
+		}
+		start.IncRef()
+		defer start.DecRef(ctx)
+		unlock()
+		return start.openCopiedUp(ctx, rp, &opts)
 	}
 
 afterTrailingSymlink:
@@ -767,20 +783,24 @@ afterTrailingSymlink:
 		start = parent
 		goto afterTrailingSymlink
 	}
-	return child.openLocked(ctx, rp, &opts)
+	if mayWrite {
+		if err := child.copyUpLocked(ctx); err != nil {
+			return nil, err
+		}
+	}
+	child.IncRef()
+	defer child.DecRef(ctx)
+	unlock()
+	return child.openCopiedUp(ctx, rp, &opts)
 }
 
-// Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Preconditions: If vfs.AccessTypesForOpenFlags(opts).MayWrite(), then d has
+// been copied up.
+func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
 	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	if ats.MayWrite() {
-		if err := d.copyUpLocked(ctx); err != nil {
-			return nil, err
-		}
-	}
 	mnt := rp.Mount()
 
 	// Directory FDs open FDs from each layer when directory entries are read,
@@ -792,7 +812,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 			return nil, syserror.EISDIR
 		}
 		// Can't open directories writably.
-		if ats&vfs.MayWrite != 0 {
+		if ats.MayWrite() {
 			return nil, syserror.EISDIR
 		}
 		if opts.Flags&linux.O_DIRECT != 0 {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index cb8b2d944..b0ec177e6 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -307,18 +307,28 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// don't need fs.mu for writing.
 	if opts.Flags&linux.O_CREAT == 0 {
 		fs.mu.RLock()
-		defer fs.mu.RUnlock()
 		d, err := resolveLocked(ctx, rp)
 		if err != nil {
+			fs.mu.RUnlock()
 			return nil, err
 		}
+		d.IncRef()
+		defer d.DecRef(ctx)
+		fs.mu.RUnlock()
 		return d.open(ctx, rp, &opts, false /* afterCreate */)
 	}
 
 	mustCreate := opts.Flags&linux.O_EXCL != 0
 	start := rp.Start().Impl().(*dentry)
 	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	unlocked := false
+	unlock := func() {
+		if !unlocked {
+			fs.mu.Unlock()
+			unlocked = true
+		}
+	}
+	defer unlock()
 	if rp.Done() {
 		// Reject attempts to open mount root directory with O_CREAT.
 		if rp.MustBeDir() {
@@ -327,6 +337,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
+		start.IncRef()
+		defer start.DecRef(ctx)
+		unlock()
 		return start.open(ctx, rp, &opts, false /* afterCreate */)
 	}
 afterTrailingSymlink:
@@ -364,6 +377,7 @@ afterTrailingSymlink:
 		creds := rp.Credentials()
 		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
 		parentDir.insertChildLocked(child, name)
+		unlock()
 		fd, err := child.open(ctx, rp, &opts, true)
 		if err != nil {
 			return nil, err
@@ -392,9 +406,14 @@ afterTrailingSymlink:
 	if rp.MustBeDir() && !child.inode.isDir() {
 		return nil, syserror.ENOTDIR
 	}
+	child.IncRef()
+	defer child.DecRef(ctx)
+	unlock()
 	return child.open(ctx, rp, &opts, false)
 }
 
+// Preconditions: The caller must hold no locks (since opening pipes may block
+// indefinitely).
 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
 	if !afterCreate {
-- 
cgit v1.2.3


From 3e6d607ee45b817d146c6a5b791a64608c1e9d0c Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Tue, 18 Aug 2020 15:57:48 -0700
Subject: RACK: Create a new list for segments.

RACK requires the segments to be in the order of their transmission
or retransmission times. This cl creates a new list and moves the
retransmitted segments to the end of the list.

PiperOrigin-RevId: 327325153
---
 pkg/tcpip/transport/tcp/BUILD       | 17 ++++++++++++++-
 pkg/tcpip/transport/tcp/connect.go  | 11 ++++++++++
 pkg/tcpip/transport/tcp/endpoint.go |  4 ++--
 pkg/tcpip/transport/tcp/segment.go  | 23 +++++++++++++++------
 pkg/tcpip/transport/tcp/snd.go      | 41 +++++++++++++++++++++++--------------
 5 files changed, 72 insertions(+), 24 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 234fb95ce..bde071f2a 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -11,7 +11,8 @@ go_template_instance(
     template = "//pkg/ilist:generic_list",
     types = {
         "Element": "*segment",
-        "Linker": "*segment",
+        "ElementMapper": "segmentMapper",
+        "Linker": "*segmentEntry",
     },
 )
 
@@ -27,6 +28,19 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "tcp_rack_segment_list",
+    out = "tcp_rack_segment_list.go",
+    package = "tcp",
+    prefix = "rackSegment",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*segment",
+        "ElementMapper": "rackSegmentMapper",
+        "Linker": "*rackSegmentEntry",
+    },
+)
+
 go_library(
     name = "tcp",
     srcs = [
@@ -55,6 +69,7 @@ go_library(
         "snd.go",
         "snd_state.go",
         "tcp_endpoint_list.go",
+        "tcp_rack_segment_list.go",
         "tcp_segment_list.go",
         "timer.go",
     ],
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 290172ac9..87980c0a1 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -924,7 +924,18 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 
 	first := e.sndQueue.Front()
 	if first != nil {
+		lastSeg := e.snd.writeList.Back()
 		e.snd.writeList.PushBackList(&e.sndQueue)
+		if lastSeg == nil {
+			lastSeg = e.snd.writeList.Front()
+		} else {
+			lastSeg = lastSeg.segEntry.Next()
+		}
+		// Add new segments to rcList, as rcList and writeList should
+		// be consistent.
+		for seg := lastSeg; seg != nil; seg = seg.segEntry.Next() {
+			e.snd.rcList.PushBack(seg)
+		}
 		e.sndBufInQueue = 0
 	}
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 1ccedebcc..21a4b6e2f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1428,7 +1428,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	vec = append([][]byte(nil), vec...)
 
 	var num int64
-	for s := e.rcvList.Front(); s != nil; s = s.Next() {
+	for s := e.rcvList.Front(); s != nil; s = s.segEntry.Next() {
 		views := s.data.Views()
 
 		for i := s.viewToDeliver; i < len(views); i++ {
@@ -2249,7 +2249,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	if !handshake {
 		e.segmentQueue.mu.Lock()
 		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
-			for s := l.Front(); s != nil; s = s.Next() {
+			for s := l.Front(); s != nil; s = s.segEntry.Next() {
 				s.id = e.ID
 				s.route = r.Clone()
 				e.sndWaker.Assert()
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 94307d31a..a20755f78 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -30,12 +30,13 @@ import (
 //
 // +stateify savable
 type segment struct {
-	segmentEntry
-	refCnt int32
-	id     stack.TransportEndpointID `state:"manual"`
-	route  stack.Route               `state:"manual"`
-	data   buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
-	hdr    header.TCP
+	segEntry     segmentEntry
+	rackSegEntry rackSegmentEntry
+	refCnt       int32
+	id           stack.TransportEndpointID `state:"manual"`
+	route        stack.Route               `state:"manual"`
+	data         buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
+	hdr          header.TCP
 	// views is used as buffer for data when its length is large
 	// enough to store a VectorisedView.
 	views [8]buffer.View `state:"nosave"`
@@ -61,6 +62,16 @@ type segment struct {
 	xmitCount uint32
 }
 
+// segmentMapper is the ElementMapper for the writeList.
+type segmentMapper struct{}
+
+func (segmentMapper) linkerFor(seg *segment) *segmentEntry { return &seg.segEntry }
+
+// rackSegmentMapper is the ElementMapper for the rcList.
+type rackSegmentMapper struct{}
+
+func (rackSegmentMapper) linkerFor(seg *segment) *rackSegmentEntry { return &seg.rackSegEntry }
+
 func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index c55589c45..31151f23d 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -154,6 +154,7 @@ type sender struct {
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
+	rcList      rackSegmentList
 	resendTimer timer       `state:"nosave"`
 	resendWaker sleep.Waker `state:"nosave"`
 
@@ -367,7 +368,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
 
 	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
 	// if it is already before such a packet.
-	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
+	for seg := s.writeList.Front(); seg != nil; seg = seg.segEntry.Next() {
 		if seg == s.writeNext {
 			// We got to writeNext before we could find a segment
 			// exceeding the MTU.
@@ -622,6 +623,7 @@ func (s *sender) splitSeg(seg *segment, size int) {
 	nSeg.data.TrimFront(size)
 	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
 	s.writeList.InsertAfter(seg, nSeg)
+	s.rcList.InsertAfter(seg, nSeg)
 
 	// The segment being split does not carry PUSH flag because it is
 	// followed by the newly split segment.
@@ -653,7 +655,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 	var s3 *segment
 	var s4 *segment
 	// Step 1.
-	for seg := nextSegHint; seg != nil; seg = seg.Next() {
+	for seg := nextSegHint; seg != nil; seg = seg.segEntry.Next() {
 		// Stop iteration if we hit a segment that has never been
 		// transmitted (i.e. either it has no assigned sequence number
 		// or if it does have one, it's >= the next sequence number
@@ -683,7 +685,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 				// NextSeg():
 				//     (1.c) IsLost(S2) returns true.
 				if s.ep.scoreboard.IsLost(segSeq) {
-					return seg, seg.Next(), false
+					return seg, seg.segEntry.Next(), false
 				}
 
 				// NextSeg():
@@ -697,7 +699,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 				// SHOULD be returned.
 				if s3 == nil {
 					s3 = seg
-					hint = seg.Next()
+					hint = seg.segEntry.Next()
 				}
 			}
 			// NextSeg():
@@ -731,7 +733,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 	// range of one segment of up to SMSS octets of
 	// previously unsent data starting with sequence number
 	// HighData+1 MUST be returned."
-	for seg := s.writeNext; seg != nil; seg = seg.Next() {
+	for seg := s.writeNext; seg != nil; seg = seg.segEntry.Next() {
 		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
 			continue
 		}
@@ -773,15 +775,16 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			// triggering bugs in poorly written DNS
 			// implementations.
 			var nextTooBig bool
-			for seg.Next() != nil && seg.Next().data.Size() != 0 {
-				if seg.data.Size()+seg.Next().data.Size() > available {
+			for seg.segEntry.Next() != nil && seg.segEntry.Next().data.Size() != 0 {
+				if seg.data.Size()+seg.segEntry.Next().data.Size() > available {
 					nextTooBig = true
 					break
 				}
-				seg.data.Append(seg.Next().data)
+				seg.data.Append(seg.segEntry.Next().data)
 
 				// Consume the segment that we just merged in.
-				s.writeList.Remove(seg.Next())
+				s.writeList.Remove(seg.segEntry.Next())
+				s.rcList.Remove(seg.rackSegEntry.Next())
 			}
 			if !nextTooBig && seg.data.Size() < available {
 				// Segment is not full.
@@ -948,7 +951,7 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 			}
 			dataSent = true
 			s.outstanding++
-			s.writeNext = nextSeg.Next()
+			s.writeNext = nextSeg.segEntry.Next()
 			continue
 		}
 
@@ -961,6 +964,7 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 		// transmitted in (C.1)."
 		s.outstanding++
 		dataSent = true
+
 		s.sendSegment(nextSeg)
 
 		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
@@ -1035,7 +1039,7 @@ func (s *sender) sendData() {
 	if s.fr.active && s.ep.sackPermitted {
 		dataSent = s.handleSACKRecovery(s.maxPayloadSize, end)
 	} else {
-		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
+		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.segEntry.Next() {
 			cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
 			if cwndLimit < limit {
 				limit = cwndLimit
@@ -1043,7 +1047,7 @@ func (s *sender) sendData() {
 			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
 				// Move writeNext along so that we don't try and scan data that
 				// has already been SACKED.
-				s.writeNext = seg.Next()
+				s.writeNext = seg.segEntry.Next()
 				continue
 			}
 			if sent := s.maybeSendSegment(seg, limit, end); !sent {
@@ -1051,7 +1055,7 @@ func (s *sender) sendData() {
 			}
 			dataSent = true
 			s.outstanding += s.pCount(seg)
-			s.writeNext = seg.Next()
+			s.writeNext = seg.segEntry.Next()
 		}
 	}
 
@@ -1182,7 +1186,7 @@ func (s *sender) SetPipe() {
 	}
 	pipe := 0
 	smss := seqnum.Size(s.ep.scoreboard.SMSS())
-	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
+	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.segEntry.Next() {
 		// With GSO each segment can be much larger than SMSS. So check the segment
 		// in SMSS sized ranges.
 		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size()))
@@ -1384,7 +1388,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			}
 
 			if s.writeNext == seg {
-				s.writeNext = seg.Next()
+				s.writeNext = seg.segEntry.Next()
 			}
 
 			// Update the RACK fields if SACK is enabled.
@@ -1393,6 +1397,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			}
 
 			s.writeList.Remove(seg)
+			s.rcList.Remove(seg)
 
 			// if SACK is enabled then Only reduce outstanding if
 			// the segment was not previously SACKED as these have
@@ -1460,6 +1465,12 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error {
 		if s.sndCwnd < s.sndSsthresh {
 			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
 		}
+
+		// Move the segment which has to be retransmitted to the end of the list, as
+		// RACK requires the segments in the order of their transmission times.
+		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2
+		// Step 5
+		s.rcList.PushBack(seg)
 	}
 	seg.xmitTime = time.Now()
 	seg.xmitCount++
-- 
cgit v1.2.3


From cf38ac1c6c32b4c514bb56fc70073788835f3766 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 18 Aug 2020 18:50:24 -0700
Subject: Add more information to panic when device ID don't match

PiperOrigin-RevId: 327351357
---
 pkg/sentry/device/device.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go
index f45b2bd2b..6ca9dc79f 100644
--- a/pkg/sentry/device/device.go
+++ b/pkg/sentry/device/device.go
@@ -256,7 +256,7 @@ func (m *MultiDevice) Load(key MultiDeviceKey, value uint64) bool {
 	}
 	if k, exists := m.rcache[value]; exists && k != key {
 		// Should never happen.
-		panic("MultiDevice's caches are inconsistent")
+		panic(fmt.Sprintf("MultiDevice's caches are inconsistent, current: %+v, previous: %+v", key, k))
 	}
 
 	// Cache value at key.
-- 
cgit v1.2.3


From dbade2ec35aa836e8e3b02b0c145b658662728b3 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 18 Aug 2020 18:51:19 -0700
Subject: Don't set atime if mount is readonly

Updates #1035

PiperOrigin-RevId: 327351475
---
 pkg/sentry/fsimpl/gofer/time.go |  2 +-
 pkg/sentry/vfs/mount.go         | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 2cb8191b9..e59d07e90 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -38,7 +38,7 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
 
 // Preconditions: d.cachedMetadataAuthoritative() == true.
 func (d *dentry) touchAtime(mnt *vfs.Mount) {
-	if mnt.Flags.NoATime {
+	if mnt.Flags.NoATime || mnt.ReadOnly() {
 		return
 	}
 	if err := mnt.CheckBeginWrite(); err != nil {
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index d1d29d0cd..67dfba986 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -114,7 +114,7 @@ func (mnt *Mount) Options() MountOptions {
 	defer mnt.vfs.mountMu.Unlock()
 	return MountOptions{
 		Flags:    mnt.Flags,
-		ReadOnly: mnt.readOnly(),
+		ReadOnly: mnt.ReadOnly(),
 	}
 }
 
@@ -688,7 +688,8 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
 	return nil
 }
 
-func (mnt *Mount) readOnly() bool {
+// ReadOnly returns true if mount is readonly.
+func (mnt *Mount) ReadOnly() bool {
 	return atomic.LoadInt64(&mnt.writers) < 0
 }
 
@@ -756,7 +757,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
 		}
 
 		opts := "rw"
-		if mnt.readOnly() {
+		if mnt.ReadOnly() {
 			opts = "ro"
 		}
 		if mnt.Flags.NoATime {
@@ -844,7 +845,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
 
 		// (6) Mount options.
 		opts := "rw"
-		if mnt.readOnly() {
+		if mnt.ReadOnly() {
 			opts = "ro"
 		}
 		if mnt.Flags.NoATime {
@@ -883,7 +884,7 @@ func superBlockOpts(mountPath string, mnt *Mount) string {
 	// gVisor doesn't (yet) have a concept of super block options, so we
 	// use the ro/rw bit from the mount flag.
 	opts := "rw"
-	if mnt.readOnly() {
+	if mnt.ReadOnly() {
 		opts = "ro"
 	}
 
-- 
cgit v1.2.3


From b99fce30936ea42bf00e2c7270dc4ca797f766eb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 18 Aug 2020 19:26:55 -0700
Subject: Move ERESTART* error definitions to syserror package.

This is needed to avoid circular dependencies between the vfs and kernel
packages.

PiperOrigin-RevId: 327355524
---
 pkg/sentry/fs/host/tty.go                    |  2 +-
 pkg/sentry/fsimpl/host/tty.go                |  2 +-
 pkg/sentry/kernel/task_run.go                |  5 +-
 pkg/sentry/kernel/task_signals.go            |  8 ++--
 pkg/sentry/kernel/task_syscall.go            | 69 +---------------------------
 pkg/sentry/syscalls/linux/error.go           |  2 +-
 pkg/sentry/syscalls/linux/sys_file.go        |  4 +-
 pkg/sentry/syscalls/linux/sys_futex.go       |  8 ++--
 pkg/sentry/syscalls/linux/sys_getdents.go    |  2 +-
 pkg/sentry/syscalls/linux/sys_lseek.go       |  2 +-
 pkg/sentry/syscalls/linux/sys_mmap.go        |  2 +-
 pkg/sentry/syscalls/linux/sys_poll.go        |  8 ++--
 pkg/sentry/syscalls/linux/sys_read.go        | 12 ++---
 pkg/sentry/syscalls/linux/sys_signal.go      |  4 +-
 pkg/sentry/syscalls/linux/sys_socket.go      | 14 +++---
 pkg/sentry/syscalls/linux/sys_splice.go      |  6 +--
 pkg/sentry/syscalls/linux/sys_sync.go        |  6 +--
 pkg/sentry/syscalls/linux/sys_thread.go      |  2 +-
 pkg/sentry/syscalls/linux/sys_time.go        |  4 +-
 pkg/sentry/syscalls/linux/sys_write.go       | 12 ++---
 pkg/sentry/syscalls/linux/vfs2/poll.go       |  8 ++--
 pkg/sentry/syscalls/linux/vfs2/read_write.go | 20 ++++----
 pkg/sentry/syscalls/linux/vfs2/socket.go     | 14 +++---
 pkg/sentry/syscalls/linux/vfs2/sync.go       |  2 +-
 pkg/syserror/syserror.go                     | 67 +++++++++++++++++++++++++++
 25 files changed, 143 insertions(+), 142 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index b5229098c..e29ae00f2 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -358,7 +358,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
 	//
 	// Linux ignores the result of kill_pgrp().
 	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
-	return kernel.ERESTARTSYS
+	return syserror.ERESTARTSYS
 }
 
 // LINT.ThenChange(../../fsimpl/host/tty.go)
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index d372c60cb..27cbd3059 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -376,7 +376,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
 	//
 	// Linux ignores the result of kill_pgrp().
 	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
-	return kernel.ERESTARTSYS
+	return syserror.ERESTARTSYS
 }
 
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index abaf29216..aa3a573c0 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -26,6 +26,7 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -189,8 +190,8 @@ func (app *runApp) execute(t *Task) taskRunState {
 	// a pending signal, causing another interruption, but that signal should
 	// not interact with the interrupted syscall.)
 	if t.haveSyscallReturn {
-		if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
-			if sre == ERESTART_RESTARTBLOCK {
+		if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+			if sre == syserror.ERESTART_RESTARTBLOCK {
 				t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
 				t.Arch().RestartSyscallWithRestartBlock()
 			} else {
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index cff2a8365..d6a2040bc 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -159,7 +159,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
 	sigact := computeAction(linux.Signal(info.Signo), act)
 
 	if t.haveSyscallReturn {
-		if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+		if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
 			// Signals that are ignored, cause a thread group stop, or
 			// terminate the thread group do not interact with interrupted
 			// syscalls; in Linux terms, they are never returned to the signal
@@ -168,11 +168,11 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
 			// signal that is actually handled (by userspace).
 			if sigact == SignalActionHandler {
 				switch {
-				case sre == ERESTARTNOHAND:
+				case sre == syserror.ERESTARTNOHAND:
 					fallthrough
-				case sre == ERESTART_RESTARTBLOCK:
+				case sre == syserror.ERESTART_RESTARTBLOCK:
 					fallthrough
-				case (sre == ERESTARTSYS && !act.IsRestart()):
+				case (sre == syserror.ERESTARTSYS && !act.IsRestart()):
 					t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
 					t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
 				default:
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index a5903b0b5..2dbf86547 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -29,75 +29,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
-// include/linux/errno.h. These errnos are never returned to userspace
-// directly, but are used to communicate the expected behavior of an
-// interrupted syscall from the syscall to signal handling.
-type SyscallRestartErrno int
-
-// These numeric values are significant because ptrace syscall exit tracing can
-// observe them.
-//
-// For all of the following errnos, if the syscall is not interrupted by a
-// signal delivered to a user handler, the syscall is restarted.
-const (
-	// ERESTARTSYS is returned by an interrupted syscall to indicate that it
-	// should be converted to EINTR if interrupted by a signal delivered to a
-	// user handler without SA_RESTART set, and restarted otherwise.
-	ERESTARTSYS = SyscallRestartErrno(512)
-
-	// ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
-	// should always be restarted.
-	ERESTARTNOINTR = SyscallRestartErrno(513)
-
-	// ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
-	// should be converted to EINTR if interrupted by a signal delivered to a
-	// user handler, and restarted otherwise.
-	ERESTARTNOHAND = SyscallRestartErrno(514)
-
-	// ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
-	// that it should be restarted using a custom function. The interrupted
-	// syscall must register a custom restart function by calling
-	// Task.SetRestartSyscallFn.
-	ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
-)
-
 var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
 
-// Error implements error.Error.
-func (e SyscallRestartErrno) Error() string {
-	// Descriptions are borrowed from strace.
-	switch e {
-	case ERESTARTSYS:
-		return "to be restarted if SA_RESTART is set"
-	case ERESTARTNOINTR:
-		return "to be restarted"
-	case ERESTARTNOHAND:
-		return "to be restarted if no handler"
-	case ERESTART_RESTARTBLOCK:
-		return "interrupted by signal"
-	default:
-		return "(unknown interrupt error)"
-	}
-}
-
-// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
-// rv, the value in a syscall return register.
-func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
-	switch int(rv) {
-	case -int(ERESTARTSYS):
-		return ERESTARTSYS, true
-	case -int(ERESTARTNOINTR):
-		return ERESTARTNOINTR, true
-	case -int(ERESTARTNOHAND):
-		return ERESTARTNOHAND, true
-	case -int(ERESTART_RESTARTBLOCK):
-		return ERESTART_RESTARTBLOCK, true
-	default:
-		return 0, false
-	}
-}
-
 // SyscallRestartBlock represents the restart block for a syscall restartable
 // with a custom function. It encapsulates the state required to restart a
 // syscall across a S/R.
@@ -447,7 +380,7 @@ func ExtractErrno(err error, sysno int) int {
 		return 0
 	case syscall.Errno:
 		return int(err)
-	case SyscallRestartErrno:
+	case syserror.SyscallRestartErrno:
 		return int(err)
 	case *memmap.BusError:
 		// Bus errors may generate SIGBUS, but for syscalls they still
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 46060f6f5..dab6207c0 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -147,7 +147,7 @@ func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op s
 	}
 
 	switch err.(type) {
-	case kernel.SyscallRestartErrno:
+	case syserror.SyscallRestartErrno:
 		// Identical to the EINTR case.
 		return true, nil
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 1bc9b184e..256422689 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -184,7 +184,7 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
 
 		file, err := d.Inode.GetFile(t, d, fileFlags)
 		if err != nil {
-			return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+			return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 		}
 		defer file.DecRef(t)
 
@@ -414,7 +414,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
 			// Create a new fs.File.
 			newFile, err = found.Inode.GetFile(t, found, fileFlags)
 			if err != nil {
-				return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+				return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 			}
 			defer newFile.DecRef(t)
 		case syserror.ENOENT:
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index 9d1b2edb1..12b2fa690 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -74,7 +74,7 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo
 	}
 
 	t.Futex().WaitComplete(w, t)
-	return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+	return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 }
 
 // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
@@ -110,7 +110,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add
 
 	// The wait duration was absolute, restart with the original arguments.
 	if forever {
-		return 0, kernel.ERESTARTSYS
+		return 0, syserror.ERESTARTSYS
 	}
 
 	// The wait duration was relative, restart with the remaining duration.
@@ -121,7 +121,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add
 		val:      val,
 		mask:     mask,
 	})
-	return 0, kernel.ERESTART_RESTARTBLOCK
+	return 0, syserror.ERESTART_RESTARTBLOCK
 }
 
 func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error {
@@ -149,7 +149,7 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.A
 	}
 
 	t.Futex().WaitComplete(w, t)
-	return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+	return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 }
 
 func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error {
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index f5699e55d..59004cefe 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -82,7 +82,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir
 	ds := newDirentSerializer(f, w, t.Arch(), size)
 	rerr := dir.Readdir(t, ds)
 
-	switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err {
+	switch err := handleIOError(t, ds.Written() > 0, rerr, syserror.ERESTARTSYS, "getdents", dir); err {
 	case nil:
 		dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
 		return uintptr(ds.Written()), nil
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index 1c38f8f4f..0046347cb 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -48,7 +48,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 
 	offset, serr := file.Seek(t, sw, offset)
-	err := handleIOError(t, false /* partialResult */, serr, kernel.ERESTARTSYS, "lseek", file)
+	err := handleIOError(t, false /* partialResult */, serr, syserror.ERESTARTSYS, "lseek", file)
 	if err != nil {
 		return 0, nil, err
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 72786b032..d0109baa4 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -267,7 +267,7 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	})
 	// MSync calls fsync, the same interrupt conversion rules apply, see
 	// mm/msync.c, fsync POSIX.1-2008.
-	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+	return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 }
 
 // Mlock implements linux syscall mlock(2).
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 3435bdf77..789e2ed5b 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -410,7 +410,7 @@ func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration
 			nfds:    nfds,
 			timeout: remainingTimeout,
 		})
-		return 0, kernel.ERESTART_RESTARTBLOCK
+		return 0, syserror.ERESTART_RESTARTBLOCK
 	}
 	return n, err
 }
@@ -464,7 +464,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Note that this means that if err is nil but copyErr is not, copyErr is
 	// ignored. This is consistent with Linux.
 	if err == syserror.EINTR && copyErr == nil {
-		err = kernel.ERESTARTNOHAND
+		err = syserror.ERESTARTNOHAND
 	}
 	return n, nil, err
 }
@@ -494,7 +494,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
 	// See comment in Ppoll.
 	if err == syserror.EINTR && copyErr == nil {
-		err = kernel.ERESTARTNOHAND
+		err = syserror.ERESTARTNOHAND
 	}
 	return n, nil, err
 }
@@ -539,7 +539,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
 	// See comment in Ppoll.
 	if err == syserror.EINTR && copyErr == nil {
-		err = kernel.ERESTARTNOHAND
+		err = syserror.ERESTARTNOHAND
 	}
 	return n, nil, err
 }
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 3bbc3fa4b..f655d3db1 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -71,7 +71,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 
 	n, err := readv(t, file, dst)
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "read", file)
 }
 
 // Readahead implements readahead(2).
@@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 
 	n, err := preadv(t, file, dst, offset)
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file)
 }
 
 // Readv implements linux syscall readv(2).
@@ -181,7 +181,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	n, err := readv(t, file, dst)
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "readv", file)
 }
 
 // Preadv implements linux syscall preadv(2).
@@ -222,7 +222,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	n, err := preadv(t, file, dst, offset)
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file)
 }
 
 // Preadv2 implements linux syscall preadv2(2).
@@ -280,12 +280,12 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	if offset == -1 {
 		n, err := readv(t, file, dst)
 		t.IOUsage().AccountReadSyscall(n)
-		return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+		return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file)
 	}
 
 	n, err := preadv(t, file, dst, offset)
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file)
 }
 
 func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) {
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index 20cb1a5cb..e748d33d8 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -348,7 +348,7 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 
 // Pause implements linux syscall pause(2).
 func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+	return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND)
 }
 
 // RtSigpending implements linux syscall rt_sigpending(2).
@@ -496,7 +496,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 	t.SetSavedSignalMask(oldmask)
 
 	// Perform the wait.
-	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+	return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND)
 }
 
 // RestartSyscall implements the linux syscall restart_syscall(2).
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index fec1c1974..38f573c14 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -285,7 +285,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 
 	blocking := !file.Flags().NonBlocking
-	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS)
 }
 
 // accept is the implementation of the accept syscall. It is called by accept
@@ -316,7 +316,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
 	peerRequested := addrLen != 0
 	nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
 	if e != nil {
-		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
 	}
 	if peerRequested {
 		// NOTE(magi): Linux does not give you an error if it can't
@@ -771,7 +771,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 	if msg.ControlLen == 0 && msg.NameLen == 0 {
 		n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
 		if err != nil {
-			return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+			return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS)
 		}
 		if !cms.Unix.Empty() {
 			mflags |= linux.MSG_CTRUNC
@@ -793,7 +793,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 	}
 	n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
 	if e != nil {
-		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
 	}
 	defer cms.Release(t)
 
@@ -882,7 +882,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
 	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
 	cm.Release(t)
 	if e != nil {
-		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
 	}
 
 	// Copy the address to the caller.
@@ -1064,7 +1064,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
 
 	// Call the syscall implementation.
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
-	err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
 	if err != nil {
 		controlMessages.Release(t)
 	}
@@ -1124,7 +1124,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
 
 	// Call the syscall implementation.
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
-	return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+	return uintptr(n), handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file)
 }
 
 // SendTo implements the linux syscall sendto(2).
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index b8846a10a..c69941feb 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -170,7 +170,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	// We can only pass a single file to handleIOError, so pick inFile
 	// arbitrarily. This is used only for debugging purposes.
-	return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "sendfile", inFile)
+	return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "sendfile", inFile)
 }
 
 // Splice implements splice(2).
@@ -280,7 +280,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	}
 
 	// See above; inFile is chosen arbitrarily here.
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "splice", inFile)
 }
 
 // Tee imlements tee(2).
@@ -333,5 +333,5 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 	}
 
 	// See above; inFile is chosen arbitrarily here.
-	return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "tee", inFile)
+	return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "tee", inFile)
 }
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index f2c0e5069..048a21c6e 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -57,7 +57,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	defer file.DecRef(t)
 
 	err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll)
-	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+	return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 }
 
 // Fdatasync implements linux syscall fdatasync(2).
@@ -73,7 +73,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	defer file.DecRef(t)
 
 	err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData)
-	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+	return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 }
 
 // SyncFileRange implements linux syscall sync_file_rage(2)
@@ -135,7 +135,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 		err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData)
 	}
 
-	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+	return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 }
 
 // LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 2d16e4933..101096038 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -262,7 +262,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
 		wopts.Events |= kernel.EventGroupContinue
 	}
 	if options&linux.WNOHANG == 0 {
-		wopts.BlockInterruptErr = kernel.ERESTARTSYS
+		wopts.BlockInterruptErr = syserror.ERESTARTSYS
 	}
 	if options&linux.WNOTHREAD == 0 {
 		wopts.SiblingChildren = true
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
index 2d2aa0819..a2a24a027 100644
--- a/pkg/sentry/syscalls/linux/sys_time.go
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -213,7 +213,7 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error
 		return nil
 	}
 
-	return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND)
+	return syserror.ConvertIntr(err, syserror.ERESTARTNOHAND)
 }
 
 // clockNanosleepFor blocks for a specified duration.
@@ -254,7 +254,7 @@ func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem use
 			duration: remaining,
 			rem:      rem,
 		})
-		return kernel.ERESTART_RESTARTBLOCK
+		return syserror.ERESTART_RESTARTBLOCK
 	default:
 		panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err))
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 485526e28..95bfe6606 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -71,7 +71,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	n, err := writev(t, file, src)
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "write", file)
 }
 
 // Pwrite64 implements linux syscall pwrite64(2).
@@ -118,7 +118,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	n, err := pwritev(t, file, src, offset)
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file)
 }
 
 // Writev implements linux syscall writev(2).
@@ -148,7 +148,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	n, err := writev(t, file, src)
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "writev", file)
 }
 
 // Pwritev implements linux syscall pwritev(2).
@@ -189,7 +189,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 
 	n, err := pwritev(t, file, src, offset)
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file)
 }
 
 // Pwritev2 implements linux syscall pwritev2(2).
@@ -250,12 +250,12 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	if offset == -1 {
 		n, err := writev(t, file, src)
 		t.IOUsage().AccountWriteSyscall(n)
-		return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+		return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
 	}
 
 	n, err := pwritev(t, file, src, offset)
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+	return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
 }
 
 func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) {
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
index 7b9d5e18a..79ad64039 100644
--- a/pkg/sentry/syscalls/linux/vfs2/poll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -415,7 +415,7 @@ func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration
 			nfds:    nfds,
 			timeout: remainingTimeout,
 		})
-		return 0, kernel.ERESTART_RESTARTBLOCK
+		return 0, syserror.ERESTART_RESTARTBLOCK
 	}
 	return n, err
 }
@@ -462,7 +462,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Note that this means that if err is nil but copyErr is not, copyErr is
 	// ignored. This is consistent with Linux.
 	if err == syserror.EINTR && copyErr == nil {
-		err = kernel.ERESTARTNOHAND
+		err = syserror.ERESTARTNOHAND
 	}
 	return n, nil, err
 }
@@ -492,7 +492,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
 	// See comment in Ppoll.
 	if err == syserror.EINTR && copyErr == nil {
-		err = kernel.ERESTARTNOHAND
+		err = syserror.ERESTARTNOHAND
 	}
 	return n, nil, err
 }
@@ -533,7 +533,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
 	// See comment in Ppoll.
 	if err == syserror.EINTR && copyErr == nil {
-		err = kernel.ERESTARTNOHAND
+		err = syserror.ERESTARTNOHAND
 	}
 	return n, nil, err
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index a905dae0a..b77b29dcc 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -62,7 +62,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 
 	n, err := read(t, file, dst, vfs.ReadOptions{})
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file)
 }
 
 // Readv implements Linux syscall readv(2).
@@ -87,7 +87,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	n, err := read(t, file, dst, vfs.ReadOptions{})
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file)
 }
 
 func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
@@ -174,7 +174,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 
 	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file)
 }
 
 // Preadv implements Linux syscall preadv(2).
@@ -205,7 +205,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file)
 }
 
 // Preadv2 implements Linux syscall preadv2(2).
@@ -251,7 +251,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 		n, err = pread(t, file, dst, offset, opts)
 	}
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file)
 }
 
 func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
@@ -332,7 +332,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	n, err := write(t, file, src, vfs.WriteOptions{})
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file)
 }
 
 // Writev implements Linux syscall writev(2).
@@ -357,7 +357,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	n, err := write(t, file, src, vfs.WriteOptions{})
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file)
 }
 
 func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
@@ -444,7 +444,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file)
 }
 
 // Pwritev implements Linux syscall pwritev(2).
@@ -475,7 +475,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 
 	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
 	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file)
 }
 
 // Pwritev2 implements Linux syscall pwritev2(2).
@@ -521,7 +521,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		n, err = pwrite(t, file, src, offset, opts)
 	}
 	t.IOUsage().AccountWriteSyscall(n)
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
 }
 
 func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index 4a68c64f3..a5032657a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -288,7 +288,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 
 	blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
-	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS)
 }
 
 // accept is the implementation of the accept syscall. It is called by accept
@@ -319,7 +319,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
 	peerRequested := addrLen != 0
 	nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
 	if e != nil {
-		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
 	}
 	if peerRequested {
 		// NOTE(magi): Linux does not give you an error if it can't
@@ -774,7 +774,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
 	if msg.ControlLen == 0 && msg.NameLen == 0 {
 		n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
 		if err != nil {
-			return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+			return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS)
 		}
 		if !cms.Unix.Empty() {
 			mflags |= linux.MSG_CTRUNC
@@ -796,7 +796,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
 	}
 	n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
 	if e != nil {
-		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
 	}
 	defer cms.Release(t)
 
@@ -885,7 +885,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
 	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
 	cm.Release(t)
 	if e != nil {
-		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
 	}
 
 	// Copy the address to the caller.
@@ -1067,7 +1067,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
 
 	// Call the syscall implementation.
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
-	err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
 	if err != nil {
 		controlMessages.Release(t)
 	}
@@ -1127,7 +1127,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
 
 	// Call the syscall implementation.
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
-	return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+	return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file)
 }
 
 // SendTo implements the linux syscall sendto(2).
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
index a6491ac37..6e9b599e2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/sync.go
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -108,7 +108,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 
 	if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 {
 		if err := file.Sync(t); err != nil {
-			return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+			return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
 		}
 	}
 	return 0, nil, nil
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 798e07b01..fe9f50169 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -153,6 +153,73 @@ func ConvertIntr(err, intr error) error {
 	return err
 }
 
+// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
+// include/linux/errno.h. These errnos are never returned to userspace
+// directly, but are used to communicate the expected behavior of an
+// interrupted syscall from the syscall to signal handling.
+type SyscallRestartErrno int
+
+// These numeric values are significant because ptrace syscall exit tracing can
+// observe them.
+//
+// For all of the following errnos, if the syscall is not interrupted by a
+// signal delivered to a user handler, the syscall is restarted.
+const (
+	// ERESTARTSYS is returned by an interrupted syscall to indicate that it
+	// should be converted to EINTR if interrupted by a signal delivered to a
+	// user handler without SA_RESTART set, and restarted otherwise.
+	ERESTARTSYS = SyscallRestartErrno(512)
+
+	// ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
+	// should always be restarted.
+	ERESTARTNOINTR = SyscallRestartErrno(513)
+
+	// ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
+	// should be converted to EINTR if interrupted by a signal delivered to a
+	// user handler, and restarted otherwise.
+	ERESTARTNOHAND = SyscallRestartErrno(514)
+
+	// ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
+	// that it should be restarted using a custom function. The interrupted
+	// syscall must register a custom restart function by calling
+	// Task.SetRestartSyscallFn.
+	ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
+)
+
+// Error implements error.Error.
+func (e SyscallRestartErrno) Error() string {
+	// Descriptions are borrowed from strace.
+	switch e {
+	case ERESTARTSYS:
+		return "to be restarted if SA_RESTART is set"
+	case ERESTARTNOINTR:
+		return "to be restarted"
+	case ERESTARTNOHAND:
+		return "to be restarted if no handler"
+	case ERESTART_RESTARTBLOCK:
+		return "interrupted by signal"
+	default:
+		return "(unknown interrupt error)"
+	}
+}
+
+// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
+// rv, the value in a syscall return register.
+func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
+	switch int(rv) {
+	case -int(ERESTARTSYS):
+		return ERESTARTSYS, true
+	case -int(ERESTARTNOINTR):
+		return ERESTARTNOINTR, true
+	case -int(ERESTARTNOHAND):
+		return ERESTARTNOHAND, true
+	case -int(ERESTART_RESTARTBLOCK):
+		return ERESTART_RESTARTBLOCK, true
+	default:
+		return 0, false
+	}
+}
+
 func init() {
 	AddErrorTranslation(ErrWouldBlock, syscall.EWOULDBLOCK)
 	AddErrorTranslation(ErrInterrupted, syscall.EINTR)
-- 
cgit v1.2.3


From ab98a35a9adb2df0359478b8898e78337e2d0392 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 18 Aug 2020 21:52:08 -0700
Subject: Get rid of kernfs.Inode.Destroy.

This interface method is unneeded.

PiperOrigin-RevId: 327370325
---
 pkg/sentry/fsimpl/host/host.go              | 19 ++++++++-----------
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 19 ++++++-------------
 pkg/sentry/fsimpl/kernfs/kernfs.go          |  4 ----
 3 files changed, 14 insertions(+), 28 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index bd6caba06..56869f59a 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -432,17 +432,14 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 
 // DecRef implements kernfs.Inode.
 func (i *inode) DecRef(ctx context.Context) {
-	i.AtomicRefCount.DecRefWithDestructor(ctx, i.Destroy)
-}
-
-// Destroy implements kernfs.Inode.
-func (i *inode) Destroy(context.Context) {
-	if i.wouldBlock {
-		fdnotifier.RemoveFD(int32(i.hostFD))
-	}
-	if err := unix.Close(i.hostFD); err != nil {
-		log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
-	}
+	i.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+		if i.wouldBlock {
+			fdnotifier.RemoveFD(int32(i.hostFD))
+		}
+		if err := unix.Close(i.hostFD); err != nil {
+			log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
+		}
+	})
 }
 
 // Open implements kernfs.Inode.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index c3efcf3ec..fe8a1e710 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -48,10 +48,6 @@ func (InodeNoopRefCount) TryIncRef() bool {
 	return true
 }
 
-// Destroy implements Inode.Destroy.
-func (InodeNoopRefCount) Destroy(context.Context) {
-}
-
 // InodeDirectoryNoNewChildren partially implements the Inode interface.
 // InodeDirectoryNoNewChildren represents a directory inode which does not
 // support creation of new children.
@@ -367,15 +363,12 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
 
 // DecRef implements Inode.DecRef.
 func (o *OrderedChildren) DecRef(ctx context.Context) {
-	o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy)
-}
-
-// Destroy cleans up resources referenced by this OrderedChildren.
-func (o *OrderedChildren) Destroy(context.Context) {
-	o.mu.Lock()
-	defer o.mu.Unlock()
-	o.order.Reset()
-	o.set = nil
+	o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+		o.mu.Lock()
+		defer o.mu.Unlock()
+		o.order.Reset()
+		o.set = nil
+	})
 }
 
 // Populate inserts children into this OrderedChildren, and d's dentry
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 080118841..51dbc050c 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -328,10 +328,6 @@ type inodeRefs interface {
 	IncRef()
 	DecRef(ctx context.Context)
 	TryIncRef() bool
-	// Destroy is called when the inode reaches zero references. Destroy release
-	// all resources (references) on objects referenced by the inode, including
-	// any child dentries.
-	Destroy(ctx context.Context)
 }
 
 type inodeMetadata interface {
-- 
cgit v1.2.3


From 01098ad9a23c01bbbd3f8d60242646f88dd42040 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 18 Aug 2020 21:55:16 -0700
Subject: [vfs] Allow offsets for special files other than regular files.

Some character and block devices can be seekable. So allow their FD to maintain
file offset.

PiperOrigin-RevId: 327370684
---
 pkg/sentry/fsimpl/gofer/special_file.go | 35 +++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index a6368fdd0..3c39aa9b7 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -39,8 +39,14 @@ type specialFileFD struct {
 	// handle is used for file I/O. handle is immutable.
 	handle handle
 
+	// isRegularFile is true if this FD represents a regular file which is only
+	// possible when filesystemOptions.regularFilesUseSpecialFileFD is in
+	// effect. isRegularFile is immutable.
+	isRegularFile bool
+
 	// seekable is true if this file description represents a file for which
-	// file offset is significant, i.e. a regular file. seekable is immutable.
+	// file offset is significant, i.e. a regular file, character device or
+	// block device. seekable is immutable.
 	seekable bool
 
 	// haveQueue is true if this file description represents a file for which
@@ -55,12 +61,13 @@ type specialFileFD struct {
 
 func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
 	ftype := d.fileType()
-	seekable := ftype == linux.S_IFREG
+	seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK
 	haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
 	fd := &specialFileFD{
-		handle:    h,
-		seekable:  seekable,
-		haveQueue: haveQueue,
+		handle:        h,
+		isRegularFile: ftype == linux.S_IFREG,
+		seekable:      seekable,
+		haveQueue:     haveQueue,
 	}
 	fd.LockFD.Init(locks)
 	if haveQueue {
@@ -200,13 +207,13 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 	// If the regular file fd was opened with O_APPEND, make sure the file size
 	// is updated. There is a possible race here if size is modified externally
 	// after metadata cache is updated.
-	if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+	if fd.isRegularFile && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
 		if err := d.updateFromGetattr(ctx); err != nil {
 			return 0, offset, err
 		}
 	}
 
-	if fd.seekable {
+	if fd.isRegularFile {
 		// We need to hold the metadataMu *while* writing to a regular file.
 		d.metadataMu.Lock()
 		defer d.metadataMu.Unlock()
@@ -236,18 +243,20 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 	if err == syserror.EAGAIN {
 		err = syserror.ErrWouldBlock
 	}
-	finalOff = offset
+	// Update offset if the offset is valid.
+	if offset >= 0 {
+		offset += int64(n)
+	}
 	// Update file size for regular files.
-	if fd.seekable {
-		finalOff += int64(n)
+	if fd.isRegularFile {
 		// d.metadataMu is already locked at this point.
-		if uint64(finalOff) > d.size {
+		if uint64(offset) > d.size {
 			d.dataMu.Lock()
 			defer d.dataMu.Unlock()
-			atomic.StoreUint64(&d.size, uint64(finalOff))
+			atomic.StoreUint64(&d.size, uint64(offset))
 		}
 	}
-	return int64(n), finalOff, err
+	return int64(n), offset, err
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
-- 
cgit v1.2.3


From 1c3c12a37e01adffe5f2ed44d094f29baf0fd2a6 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 19 Aug 2020 08:50:59 -0700
Subject: Return appropriate errors when file locking is unsuccessful.

test_eintr now passes in the Python runtime tests.

Updates #3515.

PiperOrigin-RevId: 327441081
---
 pkg/sentry/vfs/lock.go       | 16 +++++++++++--
 test/syscalls/linux/flock.cc | 54 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 66 insertions(+), 4 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go
index 6c7583a81..42666eebf 100644
--- a/pkg/sentry/vfs/lock.go
+++ b/pkg/sentry/vfs/lock.go
@@ -46,7 +46,13 @@ func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fsloc
 	if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) {
 		return nil
 	}
-	return syserror.ErrWouldBlock
+
+	// Return an appropriate error for the unsuccessful lock attempt, depending on
+	// whether this is a blocking or non-blocking operation.
+	if block == nil {
+		return syserror.ErrWouldBlock
+	}
+	return syserror.ERESTARTSYS
 }
 
 // UnlockBSD releases a BSD-style lock on the entire file.
@@ -66,7 +72,13 @@ func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fsl
 	if fl.posix.LockRegion(uid, t, rng, block) {
 		return nil
 	}
-	return syserror.ErrWouldBlock
+
+	// Return an appropriate error for the unsuccessful lock attempt, depending on
+	// whether this is a blocking or non-blocking operation.
+	if block == nil {
+		return syserror.ErrWouldBlock
+	}
+	return syserror.ERESTARTSYS
 }
 
 // UnlockPOSIX releases a POSIX-style lock on a file region.
diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc
index 638a93979..549141cbb 100644
--- a/test/syscalls/linux/flock.cc
+++ b/test/syscalls/linux/flock.cc
@@ -185,7 +185,7 @@ TEST_F(FlockTest, TestMultipleHolderSharedExclusive) {
   ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(FlockTest, TestSharedLockFailExclusiveHolder) {
+TEST_F(FlockTest, TestSharedLockFailExclusiveHolderNonblocking) {
   // This test will verify that a shared lock is denied while
   // someone holds an exclusive lock.
   ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
@@ -203,7 +203,33 @@ TEST_F(FlockTest, TestSharedLockFailExclusiveHolder) {
   ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(FlockTest, TestExclusiveLockFailExclusiveHolder) {
+void trivial_handler(int signum) {}
+
+TEST_F(FlockTest, TestSharedLockFailExclusiveHolderBlocking_NoRandomSave) {
+  const DisableSave ds;  // Timing-related.
+
+  // This test will verify that a shared lock is denied while
+  // someone holds an exclusive lock.
+  ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+              SyscallSucceedsWithValue(0));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+  // Register a signal handler for SIGALRM and set an alarm that will go off
+  // while blocking in the subsequent flock() call. This will interrupt flock()
+  // and cause it to return EINTR.
+  struct sigaction act = {};
+  act.sa_handler = trivial_handler;
+  ASSERT_THAT(sigaction(SIGALRM, &act, NULL), SyscallSucceeds());
+  ASSERT_THAT(ualarm(10000, 0), SyscallSucceeds());
+  ASSERT_THAT(flock(fd.get(), LOCK_SH), SyscallFailsWithErrno(EINTR));
+
+  // Unlock
+  ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestExclusiveLockFailExclusiveHolderNonblocking) {
   // This test will verify that an exclusive lock is denied while
   // someone already holds an exclsuive lock.
   ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
@@ -221,6 +247,30 @@ TEST_F(FlockTest, TestExclusiveLockFailExclusiveHolder) {
   ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
 }
 
+TEST_F(FlockTest, TestExclusiveLockFailExclusiveHolderBlocking_NoRandomSave) {
+  const DisableSave ds;  // Timing-related.
+
+  // This test will verify that an exclusive lock is denied while
+  // someone already holds an exclsuive lock.
+  ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+              SyscallSucceedsWithValue(0));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+  // Register a signal handler for SIGALRM and set an alarm that will go off
+  // while blocking in the subsequent flock() call. This will interrupt flock()
+  // and cause it to return EINTR.
+  struct sigaction act = {};
+  act.sa_handler = trivial_handler;
+  ASSERT_THAT(sigaction(SIGALRM, &act, NULL), SyscallSucceeds());
+  ASSERT_THAT(ualarm(10000, 0), SyscallSucceeds());
+  ASSERT_THAT(flock(fd.get(), LOCK_EX), SyscallFailsWithErrno(EINTR));
+
+  // Unlock
+  ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
 TEST_F(FlockTest, TestMultipleHolderSharedExclusiveUpgrade) {
   // This test will verify that we cannot obtain an exclusive lock while
   // a shared lock is held by another descriptor, then verify that an upgrade
-- 
cgit v1.2.3


From 55ad34a05b5b1a36e89d2269c9ca4918f81adf48 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 19 Aug 2020 11:43:24 -0700
Subject: Change runtimeoptions proto handling.

Stolen from cl/327337408 (ascannell is OOO)

PiperOrigin-RevId: 327475423
---
 pkg/shim/v2/runtimeoptions/BUILD                  | 16 ++++++-
 pkg/shim/v2/runtimeoptions/runtimeoptions.go      |  3 ++
 pkg/shim/v2/runtimeoptions/runtimeoptions.proto   |  4 +-
 pkg/shim/v2/runtimeoptions/runtimeoptions_test.go | 52 +++++++++++++++++++++++
 4 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 pkg/shim/v2/runtimeoptions/runtimeoptions_test.go

(limited to 'pkg')

diff --git a/pkg/shim/v2/runtimeoptions/BUILD b/pkg/shim/v2/runtimeoptions/BUILD
index 01716034c..ba2ed1ea7 100644
--- a/pkg/shim/v2/runtimeoptions/BUILD
+++ b/pkg/shim/v2/runtimeoptions/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library", "proto_library")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 
 package(licenses = ["notice"])
 
@@ -14,7 +14,19 @@ go_library(
     srcs = ["runtimeoptions.go"],
     visibility = ["//pkg/shim/v2:__pkg__"],
     deps = [
-        "//pkg/shim/v2/runtimeoptions:api_go_proto",
+        ":api_go_proto",
         "@com_github_gogo_protobuf//proto:go_default_library",
     ],
 )
+
+go_test(
+    name = "runtimeoptions_test",
+    size = "small",
+    srcs = ["runtimeoptions_test.go"],
+    library = ":runtimeoptions",
+    deps = [
+        "@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library",
+        "@com_github_containerd_typeurl//:go_default_library",
+        "@com_github_golang_protobuf//proto:go_default_library",
+    ],
+)
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions.go b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
index 1c1a0c5d1..aaf17b87a 100644
--- a/pkg/shim/v2/runtimeoptions/runtimeoptions.go
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
@@ -23,5 +23,8 @@ import (
 type Options = pb.Options
 
 func init() {
+	// The generated proto file auto registers with "golang/protobuf/proto"
+	// package. However, typeurl uses "golang/gogo/protobuf/proto". So registers
+	// the type there too.
 	proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options")
 }
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions.proto b/pkg/shim/v2/runtimeoptions/runtimeoptions.proto
index edb19020a..057032e34 100644
--- a/pkg/shim/v2/runtimeoptions/runtimeoptions.proto
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions.proto
@@ -14,11 +14,11 @@
 
 syntax = "proto3";
 
-package runtimeoptions;
+package cri.runtimeoptions.v1;
 
 // This is a version of the runtimeoptions CRI API that is vendored.
 //
-// Imported the full CRI package is a nightmare.
+// Importing the full CRI package is a nightmare.
 message Options {
   string type_url = 1;
   string config_path = 2;
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
new file mode 100644
index 000000000..f4c238a00
--- /dev/null
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package runtimeoptions
+
+import (
+	"testing"
+
+	shim "github.com/containerd/containerd/runtime/v1/shim/v1"
+	"github.com/containerd/typeurl"
+	"github.com/golang/protobuf/proto"
+)
+
+func TestCreateTaskRequest(t *testing.T) {
+	// Serialize the top-level message.
+	const encodedText = `options: <
+  type_url: "cri.runtimeoptions.v1.Options"
+  value: "\n\010type_url\022\013config_path"
+>`
+	got := &shim.CreateTaskRequest{} // Should have raw options.
+	if err := proto.UnmarshalText(encodedText, got); err != nil {
+		t.Fatalf("unable to unmarshal text: %v", err)
+	}
+	t.Logf("got: %s", proto.MarshalTextString(got))
+
+	// Check the options.
+	wantOptions := &Options{}
+	wantOptions.TypeUrl = "type_url"
+	wantOptions.ConfigPath = "config_path"
+	gotMessage, err := typeurl.UnmarshalAny(got.Options)
+	if err != nil {
+		t.Fatalf("unable to unmarshal any: %v", err)
+	}
+	gotOptions, ok := gotMessage.(*Options)
+	if !ok {
+		t.Fatalf("got %v, want %v", gotMessage, wantOptions)
+	}
+	if !proto.Equal(gotOptions, wantOptions) {
+		t.Fatalf("got %v, want %v", gotOptions, wantOptions)
+	}
+}
-- 
cgit v1.2.3


From 15f7c43b75f34635261df05003a4d58519bbe02e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 19 Aug 2020 11:50:54 -0700
Subject: Remove use of channels from p9.connState legacy transport.

- Remove sendDone, which currently does nothing whatsoever (errors sent to the
  channel are completely unused). Instead, have request handlers log errors
  they get from p9.send() inline.

- Replace recvOkay and recvDone with recvMu/recvIdle/recvShutdown. In addition
  to being slightly clearer (IMO), this eliminates the p9.connState.service()
  goroutine, significantly reducing the overhead involved in passing connection
  receive access between goroutines (from buffered chan send/recv + unbuffered
  chan send/recv to just a mutex unlock/lock).

PiperOrigin-RevId: 327476755
---
 pkg/p9/server.go | 147 +++++++++++++++++++++++++------------------------------
 1 file changed, 68 insertions(+), 79 deletions(-)

(limited to 'pkg')

diff --git a/pkg/p9/server.go b/pkg/p9/server.go
index b9f15e4ed..3736f12a3 100644
--- a/pkg/p9/server.go
+++ b/pkg/p9/server.go
@@ -60,12 +60,6 @@ type connState struct {
 	// server is the backing server.
 	server *Server
 
-	// sendMu is the send lock.
-	sendMu sync.Mutex
-
-	// conn is the connection.
-	conn *unet.Socket
-
 	// fids is the set of active FIDs.
 	//
 	// This is used to find FIDs for files.
@@ -92,14 +86,25 @@ type connState struct {
 
 	// -- below relates to the legacy handler --
 
-	// recvOkay indicates that a receive may start.
-	recvOkay chan bool
+	// recvMu serializes receiving from conn.
+	recvMu sync.Mutex
+
+	// recvIdle is the number of goroutines in handleRequests() attempting to
+	// lock recvMu so that they can receive from conn. recvIdle is accessed
+	// using atomic memory operations.
+	recvIdle int32
+
+	// If recvShutdown is true, at least one goroutine has observed a
+	// connection error while receiving from conn, and all goroutines in
+	// handleRequests() should exit immediately. recvShutdown is protected by
+	// recvMu.
+	recvShutdown bool
 
-	// recvDone is signalled when a message is received.
-	recvDone chan error
+	// sendMu serializes sending to conn.
+	sendMu sync.Mutex
 
-	// sendDone is signalled when a send is finished.
-	sendDone chan error
+	// conn is the connection used by the legacy transport.
+	conn *unet.Socket
 
 	// -- below relates to the flipcall handler --
 
@@ -508,11 +513,21 @@ func (cs *connState) handle(m message) (r message) {
 	return
 }
 
-// handleRequest handles a single request.
-//
-// The recvDone channel is signaled when recv is done (with a error if
-// necessary). The sendDone channel is signaled with the result of the send.
-func (cs *connState) handleRequest() {
+// handleRequest handles a single request. It returns true if the caller should
+// continue handling requests and false if it should terminate.
+func (cs *connState) handleRequest() bool {
+	// Obtain the right to receive a message from cs.conn.
+	atomic.AddInt32(&cs.recvIdle, 1)
+	cs.recvMu.Lock()
+	atomic.AddInt32(&cs.recvIdle, -1)
+
+	if cs.recvShutdown {
+		// Another goroutine already detected a connection problem; exit
+		// immediately.
+		cs.recvMu.Unlock()
+		return false
+	}
+
 	messageSize := atomic.LoadUint32(&cs.messageSize)
 	if messageSize == 0 {
 		// Default or not yet negotiated.
@@ -523,12 +538,17 @@ func (cs *connState) handleRequest() {
 	tag, m, err := recv(cs.conn, messageSize, msgRegistry.get)
 	if errSocket, ok := err.(ErrSocket); ok {
 		// Connection problem; stop serving.
-		cs.recvDone <- errSocket.error
-		return
+		log.Debugf("p9.recv: %v", errSocket.error)
+		cs.recvShutdown = true
+		cs.recvMu.Unlock()
+		return false
 	}
 
-	// Signal receive is done.
-	cs.recvDone <- nil
+	// Ensure that another goroutine is available to receive from cs.conn.
+	if atomic.LoadInt32(&cs.recvIdle) == 0 {
+		go cs.handleRequests() // S/R-SAFE: Irrelevant.
+	}
+	cs.recvMu.Unlock()
 
 	// Deal with other errors.
 	if err != nil && err != io.EOF {
@@ -537,16 +557,17 @@ func (cs *connState) handleRequest() {
 		cs.sendMu.Lock()
 		err := send(cs.conn, tag, newErr(err))
 		cs.sendMu.Unlock()
-		cs.sendDone <- err
-		return
+		if err != nil {
+			log.Debugf("p9.send: %v", err)
+		}
+		return true
 	}
 
 	// Try to start the tag.
 	if !cs.StartTag(tag) {
 		// Nothing we can do at this point; client is bogus.
 		log.Debugf("no valid tag [%05d]", tag)
-		cs.sendDone <- ErrNoValidMessage
-		return
+		return true
 	}
 
 	// Handle the message.
@@ -560,15 +581,21 @@ func (cs *connState) handleRequest() {
 	cs.sendMu.Lock()
 	err = send(cs.conn, tag, r)
 	cs.sendMu.Unlock()
-	cs.sendDone <- err
+	if err != nil {
+		log.Debugf("p9.send: %v", err)
+	}
 
 	// Return the message to the cache.
 	msgRegistry.put(m)
+
+	return true
 }
 
 func (cs *connState) handleRequests() {
-	for range cs.recvOkay {
-		cs.handleRequest()
+	for {
+		if !cs.handleRequest() {
+			return
+		}
 	}
 }
 
@@ -578,11 +605,6 @@ func (cs *connState) stop() {
 	// us with SIGABRT to get a stack dump of the offending handler.
 	cs.pendingWg.Wait()
 
-	// Close all channels.
-	close(cs.recvOkay)
-	close(cs.recvDone)
-	close(cs.sendDone)
-
 	// Free the channels.
 	cs.channelMu.Lock()
 	for _, ch := range cs.channels {
@@ -600,6 +622,9 @@ func (cs *connState) stop() {
 		cs.channelAlloc.Destroy()
 	}
 
+	// Ensure the connection is closed.
+	cs.conn.Close()
+
 	// Close all remaining fids.
 	for fid, fidRef := range cs.fids {
 		delete(cs.fids, fid)
@@ -609,59 +634,23 @@ func (cs *connState) stop() {
 		// handlers running via the wait for Pending => 0 below.
 		fidRef.DecRef()
 	}
-
-	// Ensure the connection is closed.
-	cs.conn.Close()
-}
-
-// service services requests concurrently.
-func (cs *connState) service() error {
-	// Start the first request handler.
-	go cs.handleRequests() // S/R-SAFE: Irrelevant.
-	cs.recvOkay <- true
-
-	// We loop and make sure there's always one goroutine waiting for a new
-	// request. We process all the data for a single request in one
-	// goroutine however, to ensure the best turnaround time possible.
-	for {
-		select {
-		case err := <-cs.recvDone:
-			if err != nil {
-				return err
-			}
-
-			// Kick the next receiver, or start a new handler
-			// if no receiver is currently waiting.
-			select {
-			case cs.recvOkay <- true:
-			default:
-				go cs.handleRequests() // S/R-SAFE: Irrelevant.
-				cs.recvOkay <- true
-			}
-
-		case <-cs.sendDone:
-			// Error sending a response? Nothing can be done.
-			//
-			// We don't terminate on a send error though, since
-			// we still have a pending receive. The error would
-			// have been logged above, we just ignore it here.
-		}
-	}
 }
 
 // Handle handles a single connection.
 func (s *Server) Handle(conn *unet.Socket) error {
 	cs := &connState{
-		server:   s,
-		conn:     conn,
-		fids:     make(map[FID]*fidRef),
-		tags:     make(map[Tag]chan struct{}),
-		recvOkay: make(chan bool),
-		recvDone: make(chan error, 10),
-		sendDone: make(chan error, 10),
+		server: s,
+		fids:   make(map[FID]*fidRef),
+		tags:   make(map[Tag]chan struct{}),
+		conn:   conn,
 	}
 	defer cs.stop()
-	return cs.service()
+
+	// Serve requests from conn in the current goroutine; handleRequests() will
+	// create more goroutines as needed.
+	cs.handleRequests()
+
+	return nil
 }
 
 // Serve handles requests from the bound socket.
-- 
cgit v1.2.3


From 167b2efc94816b0ff823e12c22023c3ccbd16ae9 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 19 Aug 2020 13:45:20 -0700
Subject: ip6tables: move ipv4-specific logic into its own file

A later change will introduce the equivalent IPv6 logic.

#3549

PiperOrigin-RevId: 327499064
---
 pkg/sentry/socket/netfilter/BUILD           |   1 +
 pkg/sentry/socket/netfilter/ipv4.go         | 235 +++++++++++++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go    | 262 ++++------------------------
 pkg/sentry/socket/netstack/netstack.go      |  13 +-
 pkg/sentry/socket/netstack/netstack_vfs2.go |  13 +-
 pkg/sentry/strace/socket.go                 |   2 +
 6 files changed, 293 insertions(+), 233 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/ipv4.go

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 721094bbf..795620589 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "netfilter",
     srcs = [
         "extensions.go",
+        "ipv4.go",
         "netfilter.go",
         "owner_matcher.go",
         "targets.go",
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
new file mode 100644
index 000000000..4fb887e49
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -0,0 +1,235 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// emptyIPv4Filter is for comparison with a rule's filters to determine whether
+// it is also empty. It is immutable.
+var emptyIPv4Filter = stack.IPHeaderFilter{
+	Dst:     "\x00\x00\x00\x00",
+	DstMask: "\x00\x00\x00\x00",
+	Src:     "\x00\x00\x00\x00",
+	SrcMask: "\x00\x00\x00\x00",
+}
+
+func getEntries4(table stack.Table, info *linux.IPTGetinfo) linux.KernelIPTGetEntries {
+	var entries linux.KernelIPTGetEntries
+	copy(entries.Name[:], info.Name[:])
+
+	for ruleIdx, rule := range table.Rules {
+		nflog("convert to binary: current offset: %d", entries.Size)
+
+		setHooksAndUnderflow(info, table, entries.Size, ruleIdx)
+		// Each rule corresponds to an entry.
+		entry := linux.KernelIPTEntry{
+			Entry: linux.IPTEntry{
+				IP: linux.IPTIP{
+					Protocol: uint16(rule.Filter.Protocol),
+				},
+				NextOffset:   linux.SizeOfIPTEntry,
+				TargetOffset: linux.SizeOfIPTEntry,
+			},
+		}
+		copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
+		copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
+		copy(entry.Entry.IP.Src[:], rule.Filter.Src)
+		copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
+		copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
+		copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+		if rule.Filter.DstInvert {
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+		}
+		if rule.Filter.SrcInvert {
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
+		}
+		if rule.Filter.OutputInterfaceInvert {
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
+		}
+
+		for _, matcher := range rule.Matchers {
+			// Serialize the matcher and add it to the
+			// entry.
+			serialized := marshalMatcher(matcher)
+			nflog("convert to binary: matcher serialized as: %v", serialized)
+			if len(serialized)%8 != 0 {
+				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+			}
+			entry.Elems = append(entry.Elems, serialized...)
+			entry.Entry.NextOffset += uint16(len(serialized))
+			entry.Entry.TargetOffset += uint16(len(serialized))
+		}
+
+		// Serialize and append the target.
+		serialized := marshalTarget(rule.Target)
+		if len(serialized)%8 != 0 {
+			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+		}
+		entry.Elems = append(entry.Elems, serialized...)
+		entry.Entry.NextOffset += uint16(len(serialized))
+
+		nflog("convert to binary: adding entry: %+v", entry)
+
+		entries.Size += uint32(entry.Entry.NextOffset)
+		entries.Entrytable = append(entries.Entrytable, entry)
+		info.NumEntries++
+	}
+
+	info.Size = entries.Size
+	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+	return entries
+}
+
+func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+	nflog("set entries: setting entries in table %q", replace.Name.String())
+
+	// Convert input into a list of rules and their offsets.
+	var offset uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
+	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		nflog("set entries: processing entry at offset %d", offset)
+
+		// Get the struct ipt_entry.
+		if len(optVal) < linux.SizeOfIPTEntry {
+			nflog("optVal has insufficient size for entry %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		var entry linux.IPTEntry
+		buf := optVal[:linux.SizeOfIPTEntry]
+		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+		initialOptValLen := len(optVal)
+		optVal = optVal[linux.SizeOfIPTEntry:]
+
+		if entry.TargetOffset < linux.SizeOfIPTEntry {
+			nflog("entry has too-small target offset %d", entry.TargetOffset)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): We should support more IPTIP
+		// filtering fields.
+		filter, err := filterFromIPTIP(entry.IP)
+		if err != nil {
+			nflog("bad iptip: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
+		// that they only work for certain protocols, hooks, tables.
+		// Get matchers.
+		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
+		if len(optVal) < int(matchersSize) {
+			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		matchers, err := parseMatchers(filter, optVal[:matchersSize])
+		if err != nil {
+			nflog("failed to parse matchers: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+		optVal = optVal[matchersSize:]
+
+		// Get the target of the rule.
+		targetSize := entry.NextOffset - entry.TargetOffset
+		if len(optVal) < int(targetSize) {
+			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		target, err := parseTarget(filter, optVal[:targetSize])
+		if err != nil {
+			nflog("failed to parse target: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+		optVal = optVal[targetSize:]
+
+		table.Rules = append(table.Rules, stack.Rule{
+			Filter:   filter,
+			Target:   target,
+			Matchers: matchers,
+		})
+		offsets[offset] = int(entryIdx)
+		offset += uint32(entry.NextOffset)
+
+		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+	}
+	return offsets, nil
+}
+
+func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
+	if containsUnsupportedFields4(iptip) {
+		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+	}
+	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+	}
+	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+	}
+
+	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterface)
+	}
+	ifname := string(iptip.OutputInterface[:n])
+
+	n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterfaceMask)
+	}
+	ifnameMask := string(iptip.OutputInterfaceMask[:n])
+
+	return stack.IPHeaderFilter{
+		Protocol:              tcpip.TransportProtocolNumber(iptip.Protocol),
+		Dst:                   tcpip.Address(iptip.Dst[:]),
+		DstMask:               tcpip.Address(iptip.DstMask[:]),
+		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
+		Src:                   tcpip.Address(iptip.Src[:]),
+		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
+		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
+		OutputInterface:       ifname,
+		OutputInterfaceMask:   ifnameMask,
+		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
+	}, nil
+}
+
+func containsUnsupportedFields4(iptip linux.IPTIP) bool {
+	// The following features are supported:
+	// - Protocol
+	// - Dst and DstMask
+	// - Src and SrcMask
+	// - The inverse destination IP check flag
+	// - OutputInterface, OutputInterfaceMask and its inverse.
+	var emptyInterface = [linux.IFNAMSIZ]byte{}
+	// Disable any supported inverse flags.
+	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
+	return iptip.InputInterface != emptyInterface ||
+		iptip.InputInterfaceMask != emptyInterface ||
+		iptip.Flags != 0 ||
+		iptip.InverseFlags&^inverseMask != 0
+}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index e91b0624c..df256676f 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,7 +17,6 @@
 package netfilter
 
 import (
-	"bytes"
 	"errors"
 	"fmt"
 
@@ -26,8 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -37,15 +34,6 @@ import (
 // developing iptables, but can pollute sentry logs otherwise.
 const enableLogging = false
 
-// emptyFilter is for comparison with a rule's filters to determine whether it
-// is also empty. It is immutable.
-var emptyFilter = stack.IPHeaderFilter{
-	Dst:     "\x00\x00\x00\x00",
-	DstMask: "\x00\x00\x00\x00",
-	Src:     "\x00\x00\x00\x00",
-	SrcMask: "\x00\x00\x00\x00",
-}
-
 // nflog logs messages related to the writing and reading of iptables.
 func nflog(format string, args ...interface{}) {
 	if enableLogging && log.IsLogging(log.Debug) {
@@ -71,9 +59,9 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	return info, nil
 }
 
-// GetEntries returns netstack's iptables rules encoded for the iptables tool.
-func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
-	// Read in the struct and table name.
+// GetEntries4 returns netstack's iptables rules encoded for the iptables tool.
+func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
+	// Read in the ABI struct.
 	var userEntries linux.IPTGetEntries
 	if _, err := userEntries.CopyIn(t, outPtr); err != nil {
 		nflog("couldn't copy in entries %q", userEntries.Name)
@@ -99,108 +87,48 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
-	table, ok := stack.IPTables().GetTable(tablename.String())
+func convertNetstackToBinary(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
+	// The table name has to fit in the struct.
+	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+	}
+
+	table, ok := stk.IPTables().GetTable(tablename.String())
 	if !ok {
 		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 
-	var entries linux.KernelIPTGetEntries
+	// Setup the info struct.
 	var info linux.IPTGetinfo
 	info.ValidHooks = table.ValidHooks()
-
-	// The table name has to fit in the struct.
-	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
-		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
-	}
 	copy(info.Name[:], tablename[:])
-	copy(entries.Name[:], tablename[:])
-
-	for ruleIdx, rule := range table.Rules {
-		nflog("convert to binary: current offset: %d", entries.Size)
-
-		// Is this a chain entry point?
-		for hook, hookRuleIdx := range table.BuiltinChains {
-			if hookRuleIdx == ruleIdx {
-				nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
-				info.HookEntry[hook] = entries.Size
-			}
-		}
-		// Is this a chain underflow point?
-		for underflow, underflowRuleIdx := range table.Underflows {
-			if underflowRuleIdx == ruleIdx {
-				nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
-				info.Underflow[underflow] = entries.Size
-			}
-		}
 
-		// Each rule corresponds to an entry.
-		entry := linux.KernelIPTEntry{
-			Entry: linux.IPTEntry{
-				IP: linux.IPTIP{
-					Protocol: uint16(rule.Filter.Protocol),
-				},
-				NextOffset:   linux.SizeOfIPTEntry,
-				TargetOffset: linux.SizeOfIPTEntry,
-			},
-		}
-		copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
-		copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
-		copy(entry.Entry.IP.Src[:], rule.Filter.Src)
-		copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
-		copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
-		copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
-		if rule.Filter.DstInvert {
-			entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
-		}
-		if rule.Filter.SrcInvert {
-			entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
-		}
-		if rule.Filter.OutputInterfaceInvert {
-			entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
-		}
+	entries := getEntries4(table, &info)
+	return entries, info, nil
+}
 
-		for _, matcher := range rule.Matchers {
-			// Serialize the matcher and add it to the
-			// entry.
-			serialized := marshalMatcher(matcher)
-			nflog("convert to binary: matcher serialized as: %v", serialized)
-			if len(serialized)%8 != 0 {
-				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
-			}
-			entry.Elems = append(entry.Elems, serialized...)
-			entry.Entry.NextOffset += uint16(len(serialized))
-			entry.Entry.TargetOffset += uint16(len(serialized))
+// setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint
+// or underflow, in which case it fills in info.HookEntry and info.Underflows.
+func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint32, ruleIdx int) {
+	// Is this a chain entry point?
+	for hook, hookRuleIdx := range table.BuiltinChains {
+		if hookRuleIdx == ruleIdx {
+			nflog("convert to binary: found hook %d at offset %d", hook, offset)
+			info.HookEntry[hook] = offset
 		}
-
-		// Serialize and append the target.
-		serialized := marshalTarget(rule.Target)
-		if len(serialized)%8 != 0 {
-			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+	}
+	// Is this a chain underflow point?
+	for underflow, underflowRuleIdx := range table.Underflows {
+		if underflowRuleIdx == ruleIdx {
+			nflog("convert to binary: found underflow %d at offset %d", underflow, offset)
+			info.Underflow[underflow] = offset
 		}
-		entry.Elems = append(entry.Elems, serialized...)
-		entry.Entry.NextOffset += uint16(len(serialized))
-
-		nflog("convert to binary: adding entry: %+v", entry)
-
-		entries.Size += uint32(entry.Entry.NextOffset)
-		entries.Entrytable = append(entries.Entrytable, entry)
-		info.NumEntries++
 	}
-
-	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
-	info.Size = entries.Size
-	return entries, info, nil
 }
 
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
 func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
-	// Get the basic rules data (struct ipt_replace).
-	if len(optVal) < linux.SizeOfIPTReplace {
-		nflog("optVal has insufficient size for replace %d", len(optVal))
-		return syserr.ErrInvalidArgument
-	}
 	var replace linux.IPTReplace
 	replaceBuf := optVal[:linux.SizeOfIPTReplace]
 	optVal = optVal[linux.SizeOfIPTReplace:]
@@ -218,79 +146,9 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
-	nflog("set entries: setting entries in table %q", replace.Name.String())
-
-	// Convert input into a list of rules and their offsets.
-	var offset uint32
-	// offsets maps rule byte offsets to their position in table.Rules.
-	offsets := map[uint32]int{}
-	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		nflog("set entries: processing entry at offset %d", offset)
-
-		// Get the struct ipt_entry.
-		if len(optVal) < linux.SizeOfIPTEntry {
-			nflog("optVal has insufficient size for entry %d", len(optVal))
-			return syserr.ErrInvalidArgument
-		}
-		var entry linux.IPTEntry
-		buf := optVal[:linux.SizeOfIPTEntry]
-		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
-		initialOptValLen := len(optVal)
-		optVal = optVal[linux.SizeOfIPTEntry:]
-
-		if entry.TargetOffset < linux.SizeOfIPTEntry {
-			nflog("entry has too-small target offset %d", entry.TargetOffset)
-			return syserr.ErrInvalidArgument
-		}
-
-		// TODO(gvisor.dev/issue/170): We should support more IPTIP
-		// filtering fields.
-		filter, err := filterFromIPTIP(entry.IP)
-		if err != nil {
-			nflog("bad iptip: %v", err)
-			return syserr.ErrInvalidArgument
-		}
-
-		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
-		// that they only work for certain protocols, hooks, tables.
-		// Get matchers.
-		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
-		if len(optVal) < int(matchersSize) {
-			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
-			return syserr.ErrInvalidArgument
-		}
-		matchers, err := parseMatchers(filter, optVal[:matchersSize])
-		if err != nil {
-			nflog("failed to parse matchers: %v", err)
-			return syserr.ErrInvalidArgument
-		}
-		optVal = optVal[matchersSize:]
-
-		// Get the target of the rule.
-		targetSize := entry.NextOffset - entry.TargetOffset
-		if len(optVal) < int(targetSize) {
-			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
-			return syserr.ErrInvalidArgument
-		}
-		target, err := parseTarget(filter, optVal[:targetSize])
-		if err != nil {
-			nflog("failed to parse target: %v", err)
-			return syserr.ErrInvalidArgument
-		}
-		optVal = optVal[targetSize:]
-
-		table.Rules = append(table.Rules, stack.Rule{
-			Filter:   filter,
-			Target:   target,
-			Matchers: matchers,
-		})
-		offsets[offset] = int(entryIdx)
-		offset += uint32(entry.NextOffset)
-
-		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
-			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
-			return syserr.ErrInvalidArgument
-		}
+	offsets, err := modifyEntries4(stk, optVal, &replace, &table)
+	if err != nil {
+		return err
 	}
 
 	// Go through the list of supported hooks for this table and, for each
@@ -323,7 +181,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 		}
 	}
 
-	// Add the user chains.
+	// Check the user chains.
 	for ruleIdx, rule := range table.Rules {
 		if _, ok := rule.Target.(stack.UserChainTarget); !ok {
 			continue
@@ -404,7 +262,6 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
-
 			return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
 		}
 		if len(optVal) < int(match.MatchSize) {
@@ -429,64 +286,11 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
 	return matchers, nil
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
-	if containsUnsupportedFields(iptip) {
-		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
-	}
-	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
-		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
-	}
-	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
-		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
-	}
-
-	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
-	if n == -1 {
-		n = len(iptip.OutputInterface)
-	}
-	ifname := string(iptip.OutputInterface[:n])
-
-	n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
-	if n == -1 {
-		n = len(iptip.OutputInterfaceMask)
-	}
-	ifnameMask := string(iptip.OutputInterfaceMask[:n])
-
-	return stack.IPHeaderFilter{
-		Protocol:              tcpip.TransportProtocolNumber(iptip.Protocol),
-		Dst:                   tcpip.Address(iptip.Dst[:]),
-		DstMask:               tcpip.Address(iptip.DstMask[:]),
-		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
-		Src:                   tcpip.Address(iptip.Src[:]),
-		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
-		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
-		OutputInterface:       ifname,
-		OutputInterfaceMask:   ifnameMask,
-		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
-	}, nil
-}
-
-func containsUnsupportedFields(iptip linux.IPTIP) bool {
-	// The following features are supported:
-	// - Protocol
-	// - Dst and DstMask
-	// - Src and SrcMask
-	// - The inverse destination IP check flag
-	// - OutputInterface, OutputInterfaceMask and its inverse.
-	var emptyInterface = [linux.IFNAMSIZ]byte{}
-	// Disable any supported inverse flags.
-	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
-	return iptip.InputInterface != emptyInterface ||
-		iptip.InputInterfaceMask != emptyInterface ||
-		iptip.Flags != 0 ||
-		iptip.InverseFlags&^inverseMask != 0
-}
-
 func validUnderflow(rule stack.Rule) bool {
 	if len(rule.Matchers) != 0 {
 		return false
 	}
-	if rule.Filter != emptyFilter {
+	if rule.Filter != emptyIPv4Filter {
 		return false
 	}
 	switch rule.Target.(type) {
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e4846bc0b..0e5913b60 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -949,6 +949,9 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 			if outLen < linux.SizeOfIPTGetinfo {
 				return nil, syserr.ErrInvalidArgument
 			}
+			if s.family != linux.AF_INET {
+				return nil, syserr.ErrInvalidArgument
+			}
 
 			stack := inet.StackFromContext(t)
 			if stack == nil {
@@ -964,12 +967,15 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 			if outLen < linux.SizeOfIPTGetEntries {
 				return nil, syserr.ErrInvalidArgument
 			}
+			if s.family != linux.AF_INET {
+				return nil, syserr.ErrInvalidArgument
+			}
 
 			stack := inet.StackFromContext(t)
 			if stack == nil {
 				return nil, syserr.ErrNoDevice
 			}
-			entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
+			entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
 			if err != nil {
 				return nil, err
 			}
@@ -1650,12 +1656,15 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 		return nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+	if s.skType == linux.SOCK_RAW && level == linux.SOL_IP {
 		switch name {
 		case linux.IPT_SO_SET_REPLACE:
 			if len(optVal) < linux.SizeOfIPTReplace {
 				return syserr.ErrInvalidArgument
 			}
+			if s.family != linux.AF_INET {
+				return syserr.ErrInvalidArgument
+			}
 
 			stack := inet.StackFromContext(t)
 			if stack == nil {
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index 3335e7430..1db8ae491 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -239,6 +239,9 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 			if outLen < linux.SizeOfIPTGetinfo {
 				return nil, syserr.ErrInvalidArgument
 			}
+			if s.family != linux.AF_INET {
+				return nil, syserr.ErrInvalidArgument
+			}
 
 			stack := inet.StackFromContext(t)
 			if stack == nil {
@@ -254,12 +257,15 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 			if outLen < linux.SizeOfIPTGetEntries {
 				return nil, syserr.ErrInvalidArgument
 			}
+			if s.family != linux.AF_INET {
+				return nil, syserr.ErrInvalidArgument
+			}
 
 			stack := inet.StackFromContext(t)
 			if stack == nil {
 				return nil, syserr.ErrNoDevice
 			}
-			entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
+			entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
 			if err != nil {
 				return nil, err
 			}
@@ -298,12 +304,15 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
 		return nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+	if s.skType == linux.SOCK_RAW && level == linux.SOL_IP {
 		switch name {
 		case linux.IPT_SO_SET_REPLACE:
 			if len(optVal) < linux.SizeOfIPTReplace {
 				return syserr.ErrInvalidArgument
 			}
+			if s.family != linux.AF_INET {
+				return syserr.ErrInvalidArgument
+			}
 
 			stack := inet.StackFromContext(t)
 			if stack == nil {
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index b51c4c941..08e97e6c4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -632,6 +632,8 @@ var sockOptNames = map[uint64]abi.ValueSet{
 		linux.IPV6_UNICAST_IF:          "IPV6_UNICAST_IF",
 		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
 		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
+		linux.IP6T_SO_GET_INFO:         "IP6T_SO_GET_INFO",
+		linux.IP6T_SO_GET_ENTRIES:      "IP6T_SO_GET_ENTRIES",
 	},
 	linux.SOL_NETLINK: {
 		linux.NETLINK_BROADCAST_ERROR:  "NETLINK_BROADCAST_ERROR",
-- 
cgit v1.2.3


From 00ee4cb1a26d8f3cabbbb7fc05d719d8aabbee60 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 19 Aug 2020 18:03:15 -0700
Subject: Remove path walk from localFile.Mknod

Replace mknod call with mknodat equivalent to protect
against symlink attacks. Also added Mknod tests.

Remove goferfs reliance on gofer to check for file
existence before creating a synthetic entry.

Updates #2923

PiperOrigin-RevId: 327544516
---
 pkg/sentry/fsimpl/gofer/directory.go  |  12 +-
 pkg/sentry/fsimpl/gofer/filesystem.go |  77 ++++++----
 runsc/fsgofer/BUILD                   |   1 +
 runsc/fsgofer/fsgofer.go              | 258 ++++++++++++++++++----------------
 runsc/fsgofer/fsgofer_amd64_unsafe.go |  16 +--
 runsc/fsgofer/fsgofer_arm64_unsafe.go |  16 +--
 runsc/fsgofer/fsgofer_test.go         | 147 ++++++++++++-------
 runsc/fsgofer/fsgofer_unsafe.go       |  18 +--
 8 files changed, 312 insertions(+), 233 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 2a8011eb4..40dce553e 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -82,7 +82,7 @@ type createSyntheticOpts struct {
 // Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
 // a child with the given name.
 func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
-	d2 := &dentry{
+	child := &dentry{
 		refs:      1, // held by d
 		fs:        d.fs,
 		ino:       d.fs.nextSyntheticIno(),
@@ -97,16 +97,16 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
 	case linux.S_IFDIR:
 		// Nothing else needs to be done.
 	case linux.S_IFSOCK:
-		d2.endpoint = opts.endpoint
+		child.endpoint = opts.endpoint
 	case linux.S_IFIFO:
-		d2.pipe = opts.pipe
+		child.pipe = opts.pipe
 	default:
 		panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
 	}
-	d2.pf.dentry = d2
-	d2.vfsd.Init(d2)
+	child.pf.dentry = child
+	child.vfsd.Init(child)
 
-	d.cacheNewChildLocked(d2, opts.name)
+	d.cacheNewChildLocked(child, opts.name)
 	d.syntheticChildren++
 }
 
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 9a90351e5..1b6fa4e14 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -330,7 +330,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 //
 // Preconditions: !rp.Done(). For the final path component in rp,
 // !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -399,7 +399,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
 		// stale dentry exists, the dentry will fail revalidation next time it's
 		// used.
-		if err := createInRemoteDir(parent, name); err != nil {
+		if err := createInRemoteDir(parent, name, &ds); err != nil {
 			return err
 		}
 		ev := linux.IN_CREATE
@@ -414,7 +414,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	}
 	// No cached dentry exists; however, there might still be an existing file
 	// at name. As above, we attempt the file creation RPC anyway.
-	if err := createInRemoteDir(parent, name); err != nil {
+	if err := createInRemoteDir(parent, name, &ds); err != nil {
 		return err
 	}
 	if child, ok := parent.children[name]; ok && child == nil {
@@ -721,7 +721,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error {
 		if rp.Mount() != vd.Mount() {
 			return syserror.EXDEV
 		}
@@ -754,7 +754,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	creds := rp.Credentials()
-	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
 		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
 			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
 				return err
@@ -789,34 +789,49 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
 		creds := rp.Credentials()
 		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		// If the gofer does not allow creating a socket or pipe, create a
-		// synthetic one, i.e. one that is kept entirely in memory.
-		if err == syserror.EPERM {
-			switch opts.Mode.FileType() {
-			case linux.S_IFSOCK:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name:     name,
-					mode:     opts.Mode,
-					kuid:     creds.EffectiveKUID,
-					kgid:     creds.EffectiveKGID,
-					endpoint: opts.Endpoint,
-				})
-				return nil
-			case linux.S_IFIFO:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name: name,
-					mode: opts.Mode,
-					kuid: creds.EffectiveKUID,
-					kgid: creds.EffectiveKGID,
-					pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
-				})
-				return nil
-			}
+		if err != syserror.EPERM {
+			return err
 		}
-		return err
+
+		// EPERM means that gofer does not allow creating a socket or pipe. Fallback
+		// to creating a synthetic one, i.e. one that is kept entirely in memory.
+
+		// Check that we're not overriding an existing file with a synthetic one.
+		_, err = fs.stepLocked(ctx, rp, parent, true, ds)
+		switch {
+		case err == nil:
+			// Step succeeded, another file exists.
+			return syserror.EEXIST
+		case err != syserror.ENOENT:
+			// Unexpected error.
+			return err
+		}
+
+		switch opts.Mode.FileType() {
+		case linux.S_IFSOCK:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name:     name,
+				mode:     opts.Mode,
+				kuid:     creds.EffectiveKUID,
+				kgid:     creds.EffectiveKGID,
+				endpoint: opts.Endpoint,
+			})
+			return nil
+		case linux.S_IFIFO:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name: name,
+				mode: opts.Mode,
+				kuid: creds.EffectiveKUID,
+				kgid: creds.EffectiveKGID,
+				pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+			})
+			return nil
+		}
+		// Retain error from gofer if synthetic file cannot be created internally.
+		return syserror.EPERM
 	}, nil)
 }
 
@@ -1452,7 +1467,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
 		creds := rp.Credentials()
 		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 		return err
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 05e3637f7..96c57a426 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -32,5 +32,6 @@ go_test(
         "//pkg/log",
         "//pkg/p9",
         "//pkg/test/testutil",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 639de9ca1..b0788bd23 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -29,7 +29,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"syscall"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -45,7 +44,7 @@ const (
 	// modes to ensure an unopened/closed file fails all mode checks.
 	invalidMode = p9.OpenFlags(math.MaxUint32)
 
-	openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
+	openFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC
 
 	allowedOpenFlags = unix.O_TRUNC
 )
@@ -125,7 +124,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
 }
 
 // makeQID returns a unique QID for the given stat buffer.
-func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
+func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID {
 	a.deviceMu.Lock()
 	defer a.deviceMu.Unlock()
 
@@ -156,9 +155,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // localFile implements p9.File wrapping a local file. The underlying file
 // is opened during Walk() and stored in 'file' to be used with other
 // operations. The file is opened as readonly, unless it's a symlink or there is
-// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is
-// called to clone the file. This reduces the number of walks that need to be
-// done by the host file system when files are reused.
+// no read access, which requires O_PATH.
 //
 // The file may be reopened if the requested mode in Open() is not a subset of
 // current mode. Consequently, 'file' could have a mode wider than requested and
@@ -170,11 +167,28 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // performance with 'overlay2' storage driver. overlay2 eagerly copies the
 // entire file up when it's opened in write mode, and would perform badly when
 // multiple files are only being opened for read (esp. startup).
+//
+// File operations must use "at" functions whenever possible:
+//   * Local operations must use AT_EMPTY_PATH:
+//  	   fchownat(fd, "", AT_EMPTY_PATH, ...), instead of chown(fullpath, ...)
+//   * Creation operations must use (fd + name):
+//       mkdirat(fd, name, ...), instead of mkdir(fullpath, ...)
+//
+// Apart from being faster, it also adds another layer of defense against
+// symlink attacks (note that O_NOFOLLOW applies only to the last element in
+// the path).
+//
+// The few exceptions where it cannot be done are: utimensat on symlinks, and
+// Connect() for the socket address.
 type localFile struct {
 	// attachPoint is the attachPoint that serves this localFile.
 	attachPoint *attachPoint
 
-	// hostPath will be safely updated by the Renamed hook.
+	// hostPath is the full path to the host file. It can be used for logging and
+	// the few cases where full path is required to operation the host file. In
+	// all other cases, use "file" directly.
+	//
+	// Note: it's safely updated by the Renamed hook.
 	hostPath string
 
 	// file is opened when localFile is created and it's never nil. It may be
@@ -191,7 +205,7 @@ type localFile struct {
 	mode p9.OpenFlags
 
 	// fileType for this file. It is equivalent to:
-	// syscall.Stat_t.Mode & syscall.S_IFMT
+	// unix.Stat_t.Mode & unix.S_IFMT
 	fileType uint32
 
 	qid p9.QID
@@ -211,7 +225,7 @@ var procSelfFD *fd.FD
 // OpenProcSelfFD opens the /proc/self/fd directory, which will be used to
 // reopen file descriptors.
 func OpenProcSelfFD() error {
-	d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0)
+	d, err := unix.Open("/proc/self/fd", unix.O_RDONLY|unix.O_DIRECTORY, 0)
 	if err != nil {
 		return fmt.Errorf("error opening /proc/self/fd: %v", err)
 	}
@@ -220,7 +234,7 @@ func OpenProcSelfFD() error {
 }
 
 func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
-	d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0)
+	d, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^unix.O_NOFOLLOW, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -229,17 +243,17 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
 }
 
 func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) {
-	path := path.Join(parent.hostPath, name)
-	f, readable, err := openAnyFile(path, func(mode int) (*fd.FD, error) {
+	pathDebug := path.Join(parent.hostPath, name)
+	f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) {
 		return fd.OpenAt(parent.file, name, openFlags|mode, 0)
 	})
-	return f, path, readable, err
+	return f, pathDebug, readable, err
 }
 
-// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback
+// openAnyFile attempts to open the file in O_RDONLY. If it fails, falls back
 // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the
 // actual file open and is customizable by the caller.
-func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) {
+func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) {
 	// Attempt to open file in the following mode in order:
 	//   1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
 	//      Use non-blocking to prevent getting stuck inside open(2) for
@@ -250,7 +264,7 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool,
 		readable bool
 	}{
 		{
-			mode:     syscall.O_RDONLY | syscall.O_NONBLOCK,
+			mode:     unix.O_RDONLY | unix.O_NONBLOCK,
 			readable: true,
 		},
 		{
@@ -268,36 +282,36 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool,
 			return file, option.readable, nil
 		}
 		switch e := extractErrno(err); e {
-		case syscall.ENOENT:
+		case unix.ENOENT:
 			// File doesn't exist, no point in retrying.
 			return nil, false, e
 		}
 		// File failed to open. Try again with next mode, preserving 'err' in case
 		// this was the last attempt.
-		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, path, err)
+		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, pathDebug, err)
 	}
 	// All attempts to open file have failed, return the last error.
-	log.Debugf("Failed to open file, path: %q, err: %v", path, err)
+	log.Debugf("Failed to open file, path: %q, err: %v", pathDebug, err)
 	return nil, false, extractErrno(err)
 }
 
-func checkSupportedFileType(stat syscall.Stat_t, permitSocket bool) error {
-	switch stat.Mode & syscall.S_IFMT {
-	case syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK:
+func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error {
+	switch stat.Mode & unix.S_IFMT {
+	case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK:
 		return nil
 
-	case syscall.S_IFSOCK:
+	case unix.S_IFSOCK:
 		if !permitSocket {
-			return syscall.EPERM
+			return unix.EPERM
 		}
 		return nil
 
 	default:
-		return syscall.EPERM
+		return unix.EPERM
 	}
 }
 
-func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat syscall.Stat_t) (*localFile, error) {
+func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) {
 	if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil {
 		return nil, err
 	}
@@ -307,7 +321,7 @@ func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat
 		hostPath:        path,
 		file:            file,
 		mode:            invalidMode,
-		fileType:        stat.Mode & syscall.S_IFMT,
+		fileType:        stat.Mode & unix.S_IFMT,
 		qid:             a.makeQID(stat),
 		controlReadable: readable,
 	}, nil
@@ -317,7 +331,7 @@ func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat
 // non-blocking. If anything fails, returns nil. It's better to have a file
 // without host FD, than to fail the operation.
 func newFDMaybe(file *fd.FD) *fd.FD {
-	dupFD, err := syscall.Dup(file.FD())
+	dupFD, err := unix.Dup(file.FD())
 	// Technically, the runtime may call the finalizer on file as soon as
 	// FD() returns.
 	runtime.KeepAlive(file)
@@ -327,31 +341,23 @@ func newFDMaybe(file *fd.FD) *fd.FD {
 	dup := fd.New(dupFD)
 
 	// fd is blocking; non-blocking is required.
-	if err := syscall.SetNonblock(dup.FD(), true); err != nil {
+	if err := unix.SetNonblock(dup.FD(), true); err != nil {
 		_ = dup.Close()
 		return nil
 	}
 	return dup
 }
 
-func fstat(fd int) (syscall.Stat_t, error) {
-	var stat syscall.Stat_t
-	if err := syscall.Fstat(fd, &stat); err != nil {
-		return syscall.Stat_t{}, err
-	}
-	return stat, nil
-}
-
-func stat(path string) (syscall.Stat_t, error) {
-	var stat syscall.Stat_t
-	if err := syscall.Stat(path, &stat); err != nil {
-		return syscall.Stat_t{}, err
+func fstat(fd int) (unix.Stat_t, error) {
+	var stat unix.Stat_t
+	if err := unix.Fstat(fd, &stat); err != nil {
+		return unix.Stat_t{}, err
 	}
 	return stat, nil
 }
 
 func fchown(fd int, uid p9.UID, gid p9.GID) error {
-	return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+	return unix.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
 }
 
 // Open implements p9.File.
@@ -377,7 +383,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 		// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
 		log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
 		var err error
-		osFlags := flags.OSFlags() & (syscall.O_ACCMODE | allowedOpenFlags)
+		osFlags := flags.OSFlags() & (unix.O_ACCMODE | allowedOpenFlags)
 		newFile, err = reopenProcFd(l.file, openFlags|osFlags)
 		if err != nil {
 			return nil, p9.QID{}, 0, extractErrno(err)
@@ -385,7 +391,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	}
 
 	var fd *fd.FD
-	if l.fileType == syscall.S_IFREG {
+	if l.fileType == unix.S_IFREG {
 		// Donate FD for regular files only.
 		fd = newFDMaybe(newFile)
 	}
@@ -408,7 +414,7 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 	}
 
 	// Set file creation flags, plus allowed open flags from caller.
-	osFlags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+	osFlags := openFlags | unix.O_CREAT | unix.O_EXCL
 	osFlags |= p9Flags.OSFlags() & allowedOpenFlags
 
 	// 'file' may be used for other operations (e.g. Walk), so read access is
@@ -416,9 +422,9 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 	// than needed for each particular case.
 	mode := p9Flags & p9.OpenFlagsModeMask
 	if mode == p9.WriteOnly {
-		osFlags |= syscall.O_RDWR
+		osFlags |= unix.O_RDWR
 	} else {
-		osFlags |= mode.OSFlags() & unix.O_ACCMODE
+		osFlags |= mode.OSFlags()
 	}
 
 	child, err := fd.OpenAt(l.file, name, osFlags, uint32(perm.Permissions()))
@@ -428,7 +434,7 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 	cu := cleanup.Make(func() {
 		_ = child.Close()
 		// Best effort attempt to remove the file in case of failure.
-		if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
+		if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
 		}
 	})
@@ -447,7 +453,7 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 		hostPath:    path.Join(l.hostPath, name),
 		file:        child,
 		mode:        mode,
-		fileType:    syscall.S_IFREG,
+		fileType:    unix.S_IFREG,
 		qid:         l.attachPoint.makeQID(stat),
 	}
 
@@ -461,7 +467,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 		return p9.QID{}, err
 	}
 
-	if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
+	if err := unix.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
 	cu := cleanup.Make(func() {
@@ -473,7 +479,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 	defer cu.Clean()
 
 	// Open directory to change ownership and stat it.
-	flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags
+	flags := unix.O_DIRECTORY | unix.O_RDONLY | openFlags
 	f, err := fd.OpenAt(l.file, name, flags, 0)
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
@@ -508,20 +514,20 @@ func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask,
 	return qids, file, mask, attr, nil
 }
 
-func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, error) {
+func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) {
 	// Duplicate current file if 'names' is empty.
 	if len(names) == 0 {
 		newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
 			return reopenProcFd(l.file, openFlags|mode)
 		})
 		if err != nil {
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
 		stat, err := fstat(newFile.FD())
 		if err != nil {
 			_ = newFile.Close()
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
 		c := &localFile{
@@ -537,7 +543,7 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, err
 	}
 
 	var qids []p9.QID
-	var lastStat syscall.Stat_t
+	var lastStat unix.Stat_t
 	last := l
 	for _, name := range names {
 		f, path, readable, err := openAnyFileFromParent(last, name)
@@ -545,17 +551,17 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, err
 			_ = last.Close()
 		}
 		if err != nil {
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 		lastStat, err = fstat(f.FD())
 		if err != nil {
 			_ = f.Close()
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 		c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat)
 		if err != nil {
 			_ = f.Close()
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
 		qids = append(qids, c.qid)
@@ -566,8 +572,8 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, err
 
 // StatFS implements p9.File.
 func (l *localFile) StatFS() (p9.FSStat, error) {
-	var s syscall.Statfs_t
-	if err := syscall.Fstatfs(l.file.FD(), &s); err != nil {
+	var s unix.Statfs_t
+	if err := unix.Fstatfs(l.file.FD(), &s); err != nil {
 		return p9.FSStat{}, extractErrno(err)
 	}
 
@@ -587,9 +593,9 @@ func (l *localFile) StatFS() (p9.FSStat, error) {
 // FSync implements p9.File.
 func (l *localFile) FSync() error {
 	if !l.isOpen() {
-		return syscall.EBADF
+		return unix.EBADF
 	}
-	if err := syscall.Fsync(l.file.FD()); err != nil {
+	if err := unix.Fsync(l.file.FD()); err != nil {
 		return extractErrno(err)
 	}
 	return nil
@@ -605,7 +611,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
 	return l.qid, mask, attr, nil
 }
 
-func (l *localFile) fillAttr(stat syscall.Stat_t) (p9.AttrMask, p9.Attr) {
+func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) {
 	attr := p9.Attr{
 		Mode:             p9.FileMode(stat.Mode),
 		UID:              p9.UID(stat.Uid),
@@ -665,13 +671,13 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	// consistent result that is not attribute dependent.
 	if !valid.IsSubsetOf(allowed) {
 		log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid)
-		return syscall.EPERM
+		return unix.EPERM
 	}
 
 	// Check if it's possible to use cached file, or if another one needs to be
 	// opened for write.
 	f := l.file
-	if l.fileType == syscall.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+	if l.fileType == unix.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
 		var err error
 		f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY)
 		if err != nil {
@@ -692,21 +698,21 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	// over another.
 	var err error
 	if valid.Permissions {
-		if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
+		if cerr := unix.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
 			log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr)
 			err = extractErrno(cerr)
 		}
 	}
 
 	if valid.Size {
-		if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
+		if terr := unix.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
 			log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr)
 			err = extractErrno(terr)
 		}
 	}
 
 	if valid.ATime || valid.MTime {
-		utimes := [2]syscall.Timespec{
+		utimes := [2]unix.Timespec{
 			{Sec: 0, Nsec: linux.UTIME_OMIT},
 			{Sec: 0, Nsec: linux.UTIME_OMIT},
 		}
@@ -727,15 +733,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 			}
 		}
 
-		if l.fileType == syscall.S_IFLNK {
+		if l.fileType == unix.S_IFLNK {
 			// utimensat operates different that other syscalls. To operate on a
 			// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
 			// name.
-			parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+			parent, err := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
 			if err != nil {
 				return extractErrno(err)
 			}
-			defer syscall.Close(parent)
+			defer unix.Close(parent)
 
 			if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
 				log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
@@ -760,7 +766,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 		if valid.GID {
 			gid = int(attr.GID)
 		}
-		if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+		if oerr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
 			log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
 			err = extractErrno(oerr)
 		}
@@ -770,28 +776,28 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 }
 
 func (*localFile) GetXattr(string, uint64) (string, error) {
-	return "", syscall.EOPNOTSUPP
+	return "", unix.EOPNOTSUPP
 }
 
 func (*localFile) SetXattr(string, string, uint32) error {
-	return syscall.EOPNOTSUPP
+	return unix.EOPNOTSUPP
 }
 
 func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
-	return nil, syscall.EOPNOTSUPP
+	return nil, unix.EOPNOTSUPP
 }
 
 func (*localFile) RemoveXattr(string) error {
-	return syscall.EOPNOTSUPP
+	return unix.EOPNOTSUPP
 }
 
 // Allocate implements p9.File.
 func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
 	if !l.isOpen() {
-		return syscall.EBADF
+		return unix.EBADF
 	}
 
-	if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
+	if err := unix.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
 		return extractErrno(err)
 	}
 	return nil
@@ -818,10 +824,10 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string)
 // ReadAt implements p9.File.
 func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
 	if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 	if !l.isOpen() {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 
 	r, err := l.file.ReadAt(p, int64(offset))
@@ -836,10 +842,10 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
 // WriteAt implements p9.File.
 func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
 	if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 	if !l.isOpen() {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 
 	w, err := l.file.WriteAt(p, int64(offset))
@@ -860,7 +866,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 	}
 	cu := cleanup.Make(func() {
 		// Best effort attempt to remove the symlink in case of failure.
-		if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
+		if err := unix.Unlinkat(l.file.FD(), newName, 0); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
 		}
 	})
@@ -899,34 +905,46 @@ func (l *localFile) Link(target p9.File, newName string) error {
 }
 
 // Mknod implements p9.File.
-func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
 	if err := l.checkROMount(); err != nil {
 		return p9.QID{}, err
 	}
 
-	hostPath := path.Join(l.hostPath, name)
-
-	// Return EEXIST if the file already exists.
-	if _, err := stat(hostPath); err == nil {
-		return p9.QID{}, syscall.EEXIST
-	}
-
 	// From mknod(2) man page:
 	// "EPERM: [...] if the filesystem containing pathname does not support
 	// the type of node requested."
 	if mode.FileType() != p9.ModeRegular {
-		return p9.QID{}, syscall.EPERM
+		return p9.QID{}, unix.EPERM
 	}
 
 	// Allow Mknod to create regular files.
-	if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil {
+	if err := unix.Mknodat(l.file.FD(), name, uint32(mode), 0); err != nil {
 		return p9.QID{}, err
 	}
+	cu := cleanup.Make(func() {
+		// Best effort attempt to remove the file in case of failure.
+		if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
+			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
+		}
+	})
+	defer cu.Clean()
 
-	stat, err := stat(hostPath)
+	// Open file to change ownership and stat it.
+	child, err := fd.OpenAt(l.file, name, unix.O_PATH|openFlags, 0)
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
+	defer child.Close()
+
+	if err := fchown(child.FD(), uid, gid); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	stat, err := fstat(child.FD())
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	cu.Release()
 	return l.attachPoint.makeQID(stat), nil
 }
 
@@ -945,10 +963,10 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error {
 // Readdir implements p9.File.
 func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 	if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
-		return nil, syscall.EBADF
+		return nil, unix.EBADF
 	}
 	if !l.isOpen() {
-		return nil, syscall.EBADF
+		return nil, unix.EBADF
 	}
 
 	// Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
@@ -965,7 +983,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 	// which causes the directory stream to resynchronize with the directory's
 	// current contents).
 	if l.lastDirentOffset != offset || offset == 0 {
-		if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
+		if _, err := unix.Seek(l.file.FD(), 0, 0); err != nil {
 			return nil, extractErrno(err)
 		}
 		skip = offset
@@ -998,7 +1016,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
 
 	end := offset + uint64(count)
 	for offset < end {
-		dirSize, err := syscall.ReadDirent(f, direntsBuf)
+		dirSize, err := unix.ReadDirent(f, direntsBuf)
 		if err != nil {
 			return dirents, err
 		}
@@ -1007,7 +1025,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
 		}
 
 		names := names[:0]
-		_, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names)
+		_, _, names = unix.ParseDirent(direntsBuf[:dirSize], -1, names)
 
 		// Skip over entries that the caller is not interested in.
 		if skip > 0 {
@@ -1052,7 +1070,7 @@ func (l *localFile) Readlink() (string, error) {
 			return string(b[:n]), nil
 		}
 	}
-	return "", syscall.ENOMEM
+	return "", unix.ENOMEM
 }
 
 // Flush implements p9.File.
@@ -1063,7 +1081,7 @@ func (l *localFile) Flush() error {
 // Connect implements p9.File.
 func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
 	if !l.attachPoint.conf.HostUDS {
-		return nil, syscall.ECONNREFUSED
+		return nil, unix.ECONNREFUSED
 	}
 
 	// TODO(gvisor.dev/issue/1003): Due to different app vs replacement
@@ -1071,34 +1089,34 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
 	// fit f.path in our sockaddr. We'd need to redirect through a shorter
 	// path in order to actually connect to this socket.
 	if len(l.hostPath) > linux.UnixPathMax {
-		return nil, syscall.ECONNREFUSED
+		return nil, unix.ECONNREFUSED
 	}
 
 	var stype int
 	switch flags {
 	case p9.StreamSocket:
-		stype = syscall.SOCK_STREAM
+		stype = unix.SOCK_STREAM
 	case p9.DgramSocket:
-		stype = syscall.SOCK_DGRAM
+		stype = unix.SOCK_DGRAM
 	case p9.SeqpacketSocket:
-		stype = syscall.SOCK_SEQPACKET
+		stype = unix.SOCK_SEQPACKET
 	default:
-		return nil, syscall.ENXIO
+		return nil, unix.ENXIO
 	}
 
-	f, err := syscall.Socket(syscall.AF_UNIX, stype, 0)
+	f, err := unix.Socket(unix.AF_UNIX, stype, 0)
 	if err != nil {
 		return nil, err
 	}
 
-	if err := syscall.SetNonblock(f, true); err != nil {
-		_ = syscall.Close(f)
+	if err := unix.SetNonblock(f, true); err != nil {
+		_ = unix.Close(f)
 		return nil, err
 	}
 
-	sa := syscall.SockaddrUnix{Name: l.hostPath}
-	if err := syscall.Connect(f, &sa); err != nil {
-		_ = syscall.Close(f)
+	sa := unix.SockaddrUnix{Name: l.hostPath}
+	if err := unix.Connect(f, &sa); err != nil {
+		_ = unix.Close(f)
 		return nil, err
 	}
 
@@ -1123,7 +1141,7 @@ func (l *localFile) Renamed(newDir p9.File, newName string) {
 }
 
 // extractErrno tries to determine the errno.
-func extractErrno(err error) syscall.Errno {
+func extractErrno(err error) unix.Errno {
 	if err == nil {
 		// This should never happen. The likely result will be that
 		// some user gets the frustrating "error: SUCCESS" message.
@@ -1133,18 +1151,18 @@ func extractErrno(err error) syscall.Errno {
 
 	switch err {
 	case os.ErrNotExist:
-		return syscall.ENOENT
+		return unix.ENOENT
 	case os.ErrExist:
-		return syscall.EEXIST
+		return unix.EEXIST
 	case os.ErrPermission:
-		return syscall.EACCES
+		return unix.EACCES
 	case os.ErrInvalid:
-		return syscall.EINVAL
+		return unix.EINVAL
 	}
 
 	// See if it's an errno or a common wrapped error.
 	switch e := err.(type) {
-	case syscall.Errno:
+	case unix.Errno:
 		return e
 	case *os.PathError:
 		return extractErrno(e.Err)
@@ -1156,7 +1174,7 @@ func extractErrno(err error) syscall.Errno {
 
 	// Fall back to EIO.
 	log.Debugf("Unknown error: %v, defaulting to EIO", err)
-	return syscall.EIO
+	return unix.EIO
 }
 
 func (l *localFile) checkROMount() error {
@@ -1164,7 +1182,7 @@ func (l *localFile) checkROMount() error {
 		if conf.PanicOnWrite {
 			panic("attempt to write to RO mount")
 		}
-		return syscall.EROFS
+		return unix.EROFS
 	}
 	return nil
 }
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
index 5d4aab597..c46958185 100644
--- a/runsc/fsgofer/fsgofer_amd64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -17,25 +17,25 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+	nameBytes, err := unix.BytePtrFromString(name)
 	if err != nil {
-		return syscall.Stat_t{}, err
+		return unix.Stat_t{}, err
 	}
 	namePtr := unsafe.Pointer(nameBytes)
 
-	var stat syscall.Stat_t
+	var stat unix.Stat_t
 	statPtr := unsafe.Pointer(&stat)
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_NEWFSTATAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_NEWFSTATAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
 		0,
 		0); errno != 0 {
 
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+		return unix.Stat_t{}, syserr.FromHost(errno).ToError()
 	}
 	return stat, nil
 }
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
index 8041fd352..491460718 100644
--- a/runsc/fsgofer/fsgofer_arm64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -17,25 +17,25 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+	nameBytes, err := unix.BytePtrFromString(name)
 	if err != nil {
-		return syscall.Stat_t{}, err
+		return unix.Stat_t{}, err
 	}
 	namePtr := unsafe.Pointer(nameBytes)
 
-	var stat syscall.Stat_t
+	var stat unix.Stat_t
 	statPtr := unsafe.Pointer(&stat)
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_FSTATAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_FSTATAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
 		0,
 		0); errno != 0 {
 
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+		return unix.Stat_t{}, syserr.FromHost(errno).ToError()
 	}
 	return stat, nil
 }
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 8ed703584..c91cfd094 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -21,9 +21,9 @@ import (
 	"os"
 	"path"
 	"path/filepath"
-	"syscall"
 	"testing"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/test/testutil"
@@ -32,7 +32,7 @@ import (
 var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
 
 var (
-	allTypes = []uint32{syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK}
+	allTypes = []uint32{unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK}
 
 	// allConfs is set in init().
 	allConfs []Config
@@ -83,7 +83,7 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
 		}
 		want = append(want, b...)
 	} else {
-		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+		if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
 			return fmt.Errorf("WriteAt() should have failed, got: %d, want: EBADFD", err)
 		}
 	}
@@ -101,7 +101,7 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
 			return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want)
 		}
 	} else {
-		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+		if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
 			return fmt.Errorf("ReadAt() should have failed, got: %d, want: EBADFD", err)
 		}
 	}
@@ -121,11 +121,11 @@ func (s state) String() string {
 
 func typeName(fileType uint32) string {
 	switch fileType {
-	case syscall.S_IFREG:
+	case unix.S_IFREG:
 		return "file"
-	case syscall.S_IFDIR:
+	case unix.S_IFDIR:
 		return "directory"
-	case syscall.S_IFLNK:
+	case unix.S_IFLNK:
 		return "symlink"
 	default:
 		panic(fmt.Sprintf("invalid file type for test: %d", fileType))
@@ -195,19 +195,19 @@ func setup(fileType uint32) (string, string, error) {
 
 	var name string
 	switch fileType {
-	case syscall.S_IFREG:
+	case unix.S_IFREG:
 		name = "file"
 		_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
 		if err != nil {
 			return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err)
 		}
 		defer f.Close()
-	case syscall.S_IFDIR:
+	case unix.S_IFDIR:
 		name = "dir"
 		if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err)
 		}
-	case syscall.S_IFLNK:
+	case unix.S_IFLNK:
 		name = "symlink"
 		if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err)
@@ -227,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) {
 }
 
 func TestReadWrite(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
 			t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -246,9 +246,13 @@ func TestReadWrite(t *testing.T) {
 			if err != nil {
 				t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
 			}
-			if _, _, _, err := l.Open(flags); err != nil {
+			fd, _, _, err := l.Open(flags)
+			if err != nil {
 				t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err)
 			}
+			if fd != nil {
+				defer fd.Close()
+			}
 			if err := testReadWrite(l, flags, want); err != nil {
 				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
 			}
@@ -257,14 +261,14 @@ func TestReadWrite(t *testing.T) {
 }
 
 func TestCreate(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		for i, flags := range allOpenFlags {
 			_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
 			if err != nil {
 				t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, flags, err)
 			}
 
-			if err := testReadWrite(l, flags, []byte{}); err != nil {
+			if err := testReadWrite(l, flags, nil); err != nil {
 				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
 			}
 		}
@@ -274,7 +278,7 @@ func TestCreate(t *testing.T) {
 // TestReadWriteDup tests that a file opened in any mode can be dup'ed and
 // reopened in any other mode.
 func TestReadWriteDup(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
 			t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -304,9 +308,13 @@ func TestReadWriteDup(t *testing.T) {
 					t.Fatalf("%v: Walk(<empty>) failed: %v", s, err)
 				}
 				defer dup.Close()
-				if _, _, _, err := dup.Open(dupFlags); err != nil {
+				fd, _, _, err := dup.Open(dupFlags)
+				if err != nil {
 					t.Fatalf("%v: Open(%v) failed: %v", s, flags, err)
 				}
+				if fd != nil {
+					defer fd.Close()
+				}
 				if err := testReadWrite(dup, dupFlags, want); err != nil {
 					t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err)
 				}
@@ -316,19 +324,19 @@ func TestReadWriteDup(t *testing.T) {
 }
 
 func TestUnopened(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFREG}, allConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s state) {
 		b := []byte("foobar")
-		if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF {
-			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.WriteAt(b, 0); err != unix.EBADF {
+			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF {
-			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.ReadAt(b, 0); err != unix.EBADF {
+			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if _, err := s.file.Readdir(0, 100); err != syscall.EBADF {
-			t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.Readdir(0, 100); err != unix.EBADF {
+			t.Errorf("%v: Readdir() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if err := s.file.FSync(); err != syscall.EBADF {
-			t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if err := s.file.FSync(); err != unix.EBADF {
+			t.Errorf("%v: FSync() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
 	})
 }
@@ -338,7 +346,7 @@ func TestUnopened(t *testing.T) {
 // was open with O_PATH, but Open() was not checking for it and allowing the
 // control file to be reused.
 func TestOpenOPath(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFREG}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) {
 		// Fist remove all permissions on the file.
 		if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil {
 			t.Fatalf("SetAttr(): %v", err)
@@ -353,7 +361,7 @@ func TestOpenOPath(t *testing.T) {
 		if newFile.(*localFile).controlReadable {
 			t.Fatalf("control file didn't open with O_PATH: %+v", newFile)
 		}
-		if _, _, _, err := newFile.Open(p9.ReadOnly); err != syscall.EACCES {
+		if _, _, _, err := newFile.Open(p9.ReadOnly); err != unix.EACCES {
 			t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err)
 		}
 	})
@@ -375,7 +383,7 @@ func TestSetAttrPerm(t *testing.T) {
 		valid := p9.SetAttrMask{Permissions: true}
 		attr := p9.SetAttr{Permissions: 0777}
 		got, err := SetGetAttr(s.file, valid, attr)
-		if s.fileType == syscall.S_IFLNK {
+		if s.fileType == unix.S_IFLNK {
 			if err == nil {
 				t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
 			}
@@ -396,7 +404,7 @@ func TestSetAttrSize(t *testing.T) {
 			valid := p9.SetAttrMask{Size: true}
 			attr := p9.SetAttr{Size: size}
 			got, err := SetGetAttr(s.file, valid, attr)
-			if s.fileType == syscall.S_IFLNK || s.fileType == syscall.S_IFDIR {
+			if s.fileType == unix.S_IFLNK || s.fileType == unix.S_IFDIR {
 				if err == nil {
 					t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
 				}
@@ -478,9 +486,9 @@ func TestLink(t *testing.T) {
 		}
 
 		err = dir.Link(s.file, linkFile)
-		if s.fileType == syscall.S_IFDIR {
-			if err != syscall.EPERM {
-				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
+		if s.fileType == unix.S_IFDIR {
+			if err != unix.EPERM {
+				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: unix.EPERM", s, linkFile, err)
 			}
 			return
 		}
@@ -491,9 +499,12 @@ func TestLink(t *testing.T) {
 }
 
 func TestROMountChecks(t *testing.T) {
-	const want = syscall.EROFS
+	const want = unix.EROFS
+	uid := p9.UID(os.Getuid())
+	gid := p9.GID(os.Getgid())
+
 	runCustom(t, allTypes, roConfs, func(t *testing.T, s state) {
-		if s.fileType != syscall.S_IFLNK {
+		if s.fileType != unix.S_IFLNK {
 			if _, _, _, err := s.file.Open(p9.WriteOnly); err != want {
 				t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
 			}
@@ -512,16 +523,16 @@ func TestROMountChecks(t *testing.T) {
 			}
 		}
 
-		if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != want {
+		if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid); err != want {
 			t.Errorf("Create() should have failed, got: %v, expected: %v", err, want)
 		}
-		if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != want {
+		if _, err := s.file.Mkdir("some_dir", 0777, uid, gid); err != want {
 			t.Errorf("MkDir() should have failed, got: %v, expected: %v", err, want)
 		}
 		if err := s.file.RenameAt("some_file", s.file, "other_file"); err != want {
 			t.Errorf("Rename() should have failed, got: %v, expected: %v", err, want)
 		}
-		if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != want {
+		if _, err := s.file.Symlink("some_place", "some_symlink", uid, gid); err != want {
 			t.Errorf("Symlink() should have failed, got: %v, expected: %v", err, want)
 		}
 		if err := s.file.UnlinkAt("some_file", 0); err != want {
@@ -530,6 +541,9 @@ func TestROMountChecks(t *testing.T) {
 		if err := s.file.Link(s.file, "some_link"); err != want {
 			t.Errorf("Link() should have failed, got: %v, expected: %v", err, want)
 		}
+		if _, err := s.file.Mknod("some-nod", 0777, 1, 2, uid, gid); err != want {
+			t.Errorf("Mknod() should have failed, got: %v, expected: %v", err, want)
+		}
 
 		valid := p9.SetAttrMask{Size: true}
 		attr := p9.SetAttr{Size: 0}
@@ -541,16 +555,20 @@ func TestROMountChecks(t *testing.T) {
 
 func TestROMountPanics(t *testing.T) {
 	conf := Config{ROMount: true, PanicOnWrite: true}
+	uid := p9.UID(os.Getuid())
+	gid := p9.GID(os.Getgid())
+
 	runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) {
-		if s.fileType != syscall.S_IFLNK {
+		if s.fileType != unix.S_IFLNK {
 			assertPanic(t, func() { s.file.Open(p9.WriteOnly) })
 		}
-		assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
-		assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
+		assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid) })
+		assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, uid, gid) })
 		assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") })
-		assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
+		assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", uid, gid) })
 		assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) })
 		assertPanic(t, func() { s.file.Link(s.file, "some_link") })
+		assertPanic(t, func() { s.file.Mknod("some-nod", 0777, 1, 2, uid, gid) })
 
 		valid := p9.SetAttrMask{Size: true}
 		attr := p9.SetAttr{Size: 0}
@@ -559,9 +577,9 @@ func TestROMountPanics(t *testing.T) {
 }
 
 func TestWalkNotFound(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, allConfs, func(t *testing.T, s state) {
-		if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT {
-			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err)
+	runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
+		if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT {
+			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err)
 		}
 	})
 }
@@ -580,7 +598,7 @@ func TestWalkDup(t *testing.T) {
 }
 
 func TestReaddir(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		name := "dir"
 		if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
@@ -705,7 +723,7 @@ func TestAttachInvalidType(t *testing.T) {
 	defer os.RemoveAll(dir)
 
 	fifo := filepath.Join(dir, "fifo")
-	if err := syscall.Mkfifo(fifo, 0755); err != nil {
+	if err := unix.Mkfifo(fifo, 0755); err != nil {
 		t.Fatalf("Mkfifo(%q): %v", fifo, err)
 	}
 
@@ -766,16 +784,16 @@ func TestDoubleAttachError(t *testing.T) {
 }
 
 func TestTruncate(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
-			t.Fatalf("createFile() failed, err: %v", err)
+			t.Fatalf("createFile() failed: %v", err)
 		}
 		defer child.Close()
 		want := []byte("foobar")
 		w, err := child.WriteAt(want, 0)
 		if err != nil {
-			t.Fatalf("Write() failed, err: %v", err)
+			t.Fatalf("Write() failed: %v", err)
 		}
 		if w != len(want) {
 			t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(want))
@@ -783,12 +801,15 @@ func TestTruncate(t *testing.T) {
 
 		_, l, err := s.file.Walk([]string{"test"})
 		if err != nil {
-			t.Fatalf("Walk(%s) failed, err: %v", "test", err)
+			t.Fatalf("Walk(%s) failed: %v", "test", err)
 		}
 		if _, _, _, err := l.Open(p9.ReadOnly | p9.OpenTruncate); err != nil {
-			t.Fatalf("Open() failed, err: %v", err)
+			t.Fatalf("Open() failed: %v", err)
 		}
 		_, mask, attr, err := l.GetAttr(p9.AttrMask{Size: true})
+		if err != nil {
+			t.Fatalf("GetAttr() failed: %v", err)
+		}
 		if !mask.Size {
 			t.Fatalf("GetAttr() didn't return size: %+v", mask)
 		}
@@ -797,3 +818,27 @@ func TestTruncate(t *testing.T) {
 		}
 	})
 }
+
+func TestMknod(t *testing.T) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+		_, err := s.file.Mknod("test", p9.ModeRegular|0777, 1, 2, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			t.Fatalf("Mknod() failed: %v", err)
+		}
+
+		_, f, err := s.file.Walk([]string{"test"})
+		if err != nil {
+			t.Fatalf("Walk() failed: %v", err)
+		}
+		fd, _, _, err := f.Open(p9.ReadWrite)
+		if err != nil {
+			t.Fatalf("Open() failed: %v", err)
+		}
+		if fd != nil {
+			defer fd.Close()
+		}
+		if err := testReadWrite(f, p9.ReadWrite, nil); err != nil {
+			t.Fatalf("testReadWrite() failed: %v", err)
+		}
+	})
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index 542b54365..f11fea40d 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -15,18 +15,18 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+func utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error {
 	// utimensat(2) doesn't accept empty name, instead name must be nil to make it
 	// operate directly on 'dirFd' unlike other *at syscalls.
 	var namePtr unsafe.Pointer
 	if name != "" {
-		nameBytes, err := syscall.BytePtrFromString(name)
+		nameBytes, err := unix.BytePtrFromString(name)
 		if err != nil {
 			return err
 		}
@@ -35,8 +35,8 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
 
 	timesPtr := unsafe.Pointer(&times[0])
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_UTIMENSAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_UTIMENSAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(timesPtr),
@@ -52,7 +52,7 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
 func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error {
 	var oldNamePtr unsafe.Pointer
 	if oldName != "" {
-		nameBytes, err := syscall.BytePtrFromString(oldName)
+		nameBytes, err := unix.BytePtrFromString(oldName)
 		if err != nil {
 			return err
 		}
@@ -60,15 +60,15 @@ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error
 	}
 	var newNamePtr unsafe.Pointer
 	if newName != "" {
-		nameBytes, err := syscall.BytePtrFromString(newName)
+		nameBytes, err := unix.BytePtrFromString(newName)
 		if err != nil {
 			return err
 		}
 		newNamePtr = unsafe.Pointer(nameBytes)
 	}
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_RENAMEAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_RENAMEAT,
 		uintptr(oldDirFD),
 		uintptr(oldNamePtr),
 		uintptr(newDirFD),
-- 
cgit v1.2.3


From 8a725d8a66ef1c38b256c52c1865e5000cc8ca36 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 19 Aug 2020 18:35:35 -0700
Subject: Move boot.Config to its own package

Updates #3494

PiperOrigin-RevId: 327548511
---
 pkg/test/testutil/BUILD                    |   2 +-
 pkg/test/testutil/testutil.go              |  12 +-
 runsc/BUILD                                |   4 +-
 runsc/boot/BUILD                           |   3 +-
 runsc/boot/config.go                       | 336 --------------------------
 runsc/boot/controller.go                   |   3 +-
 runsc/boot/fs.go                           |  41 ++--
 runsc/boot/fs_test.go                      |  11 +-
 runsc/boot/loader.go                       |  19 +-
 runsc/boot/loader_test.go                  |   7 +-
 runsc/boot/network.go                      |  45 +---
 runsc/boot/strace.go                       |   3 +-
 runsc/boot/vfs.go                          |  21 +-
 runsc/cmd/BUILD                            |   3 +-
 runsc/cmd/boot.go                          |   3 +-
 runsc/cmd/capability_test.go               |   4 +-
 runsc/cmd/checkpoint.go                    |   4 +-
 runsc/cmd/create.go                        |   4 +-
 runsc/cmd/debug.go                         |   4 +-
 runsc/cmd/delete.go                        |   6 +-
 runsc/cmd/delete_test.go                   |   4 +-
 runsc/cmd/do.go                            |  10 +-
 runsc/cmd/events.go                        |   4 +-
 runsc/cmd/exec.go                          |   4 +-
 runsc/cmd/gofer.go                         |  12 +-
 runsc/cmd/kill.go                          |   4 +-
 runsc/cmd/list.go                          |   4 +-
 runsc/cmd/pause.go                         |   4 +-
 runsc/cmd/ps.go                            |   4 +-
 runsc/cmd/restore.go                       |   4 +-
 runsc/cmd/resume.go                        |   4 +-
 runsc/cmd/run.go                           |   4 +-
 runsc/cmd/start.go                         |   4 +-
 runsc/cmd/state.go                         |   4 +-
 runsc/cmd/wait.go                          |   4 +-
 runsc/config/BUILD                         |  15 ++
 runsc/config/config.go                     | 376 +++++++++++++++++++++++++++++
 runsc/container/BUILD                      |   2 +
 runsc/container/container.go               |  11 +-
 runsc/container/container_test.go          |  12 +-
 runsc/container/multi_container_test.go    |   3 +-
 runsc/container/shared_volume_test.go      |   6 +-
 runsc/fsgofer/fsgofer_test.go              |   4 +-
 runsc/main.go                              |  16 +-
 runsc/sandbox/BUILD                        |   1 +
 runsc/sandbox/network.go                   |  11 +-
 runsc/sandbox/sandbox.go                   |  19 +-
 website/blog/2019-11-18-security-basics.md |   2 +-
 48 files changed, 561 insertions(+), 526 deletions(-)
 delete mode 100644 runsc/boot/config.go
 create mode 100644 runsc/config/BUILD
 create mode 100644 runsc/config/config.go

(limited to 'pkg')

diff --git a/pkg/test/testutil/BUILD b/pkg/test/testutil/BUILD
index 2d8f56bc0..c4b131896 100644
--- a/pkg/test/testutil/BUILD
+++ b/pkg/test/testutil/BUILD
@@ -12,7 +12,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/sync",
-        "//runsc/boot",
+        "//runsc/config",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index 1580527b5..3cb6c6814 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -44,7 +44,7 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -133,21 +133,21 @@ func Command(logger Logger, args ...string) *Cmd {
 
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
-func TestConfig(t *testing.T) *boot.Config {
+func TestConfig(t *testing.T) *config.Config {
 	logDir := os.TempDir()
 	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
 		logDir = dir + "/"
 	}
-	return &boot.Config{
+	return &config.Config{
 		Debug:              true,
 		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
 		LogFormat:          "text",
 		DebugLogFormat:     "text",
 		LogPackets:         true,
-		Network:            boot.NetworkNone,
+		Network:            config.NetworkNone,
 		Strace:             true,
 		Platform:           "ptrace",
-		FileAccess:         boot.FileAccessExclusive,
+		FileAccess:         config.FileAccessExclusive,
 		NumNetworkChannels: 1,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
@@ -203,7 +203,7 @@ func SetupRootDir() (string, func(), error) {
 
 // SetupContainer creates a bundle and root dir for the container, generates a
 // test config, and writes the spec to config.json in the bundle dir.
-func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, cleanup func(), err error) {
+func SetupContainer(spec *specs.Spec, conf *config.Config) (rootDir, bundleDir string, cleanup func(), err error) {
 	rootDir, rootCleanup, err := SetupRootDir()
 	if err != nil {
 		return "", "", nil, err
diff --git a/runsc/BUILD b/runsc/BUILD
index 96f697a5f..267fb2af8 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -17,8 +17,8 @@ go_binary(
         "//pkg/log",
         "//pkg/refs",
         "//pkg/sentry/platform",
-        "//runsc/boot",
         "//runsc/cmd",
+        "//runsc/config",
         "//runsc/flag",
         "//runsc/specutils",
         "@com_github_google_subcommands//:go_default_library",
@@ -53,8 +53,8 @@ go_binary(
         "//pkg/log",
         "//pkg/refs",
         "//pkg/sentry/platform",
-        "//runsc/boot",
         "//runsc/cmd",
+        "//runsc/config",
         "//runsc/flag",
         "//runsc/specutils",
         "@com_github_google_subcommands//:go_default_library",
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 9f52438c2..040f6a72d 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -8,7 +8,6 @@ go_library(
         "compat.go",
         "compat_amd64.go",
         "compat_arm64.go",
-        "config.go",
         "controller.go",
         "debug.go",
         "events.go",
@@ -105,6 +104,7 @@ go_library(
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
         "//runsc/boot/pprof",
+        "//runsc/config",
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
@@ -131,6 +131,7 @@ go_test(
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/unet",
+        "//runsc/config",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
deleted file mode 100644
index 80da8b3e6..000000000
--- a/runsc/boot/config.go
+++ /dev/null
@@ -1,336 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"fmt"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/watchdog"
-)
-
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
-	// FileAccessShared sends IO requests to a Gofer process that validates the
-	// requests and forwards them to the host.
-	FileAccessShared FileAccessType = iota
-
-	// FileAccessExclusive is the same as FileAccessShared, but enables
-	// extra caching for improved performance. It should only be used if
-	// the sandbox has exclusive access to the filesystem.
-	FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
-	switch s {
-	case "shared":
-		return FileAccessShared, nil
-	case "exclusive":
-		return FileAccessExclusive, nil
-	default:
-		return 0, fmt.Errorf("invalid file access type %q", s)
-	}
-}
-
-func (f FileAccessType) String() string {
-	switch f {
-	case FileAccessShared:
-		return "shared"
-	case FileAccessExclusive:
-		return "exclusive"
-	default:
-		return fmt.Sprintf("unknown(%d)", f)
-	}
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
-	// NetworkSandbox uses internal network stack, isolated from the host.
-	NetworkSandbox NetworkType = iota
-
-	// NetworkHost redirects network related syscalls to the host network.
-	NetworkHost
-
-	// NetworkNone sets up just loopback using netstack.
-	NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
-	switch s {
-	case "sandbox":
-		return NetworkSandbox, nil
-	case "host":
-		return NetworkHost, nil
-	case "none":
-		return NetworkNone, nil
-	default:
-		return 0, fmt.Errorf("invalid network type %q", s)
-	}
-}
-
-func (n NetworkType) String() string {
-	switch n {
-	case NetworkSandbox:
-		return "sandbox"
-	case NetworkHost:
-		return "host"
-	case NetworkNone:
-		return "none"
-	default:
-		return fmt.Sprintf("unknown(%d)", n)
-	}
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
-	switch strings.ToLower(s) {
-	case "log", "logwarning":
-		return watchdog.LogWarning, nil
-	case "panic":
-		return watchdog.Panic, nil
-	default:
-		return 0, fmt.Errorf("invalid watchdog action %q", s)
-	}
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
-	switch strings.ToLower(s) {
-	case "disabled":
-		return refs.NoLeakChecking, nil
-	case "log-names":
-		return refs.LeaksLogWarning, nil
-	case "log-traces":
-		return refs.LeaksLogTraces, nil
-	default:
-		return 0, fmt.Errorf("invalid refs leakmode %q", s)
-	}
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
-	switch mode {
-	// If not set, default it to disabled.
-	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
-		return "disabled"
-	case refs.LeaksLogWarning:
-		return "log-names"
-	case refs.LeaksLogTraces:
-		return "log-traces"
-	default:
-		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
-	}
-}
-
-// Config holds configuration that is not part of the runtime spec.
-type Config struct {
-	// RootDir is the runtime root directory.
-	RootDir string
-
-	// Debug indicates that debug logging should be enabled.
-	Debug bool
-
-	// LogFilename is the filename to log to, if not empty.
-	LogFilename string
-
-	// LogFormat is the log format.
-	LogFormat string
-
-	// DebugLog is the path to log debug information to, if not empty.
-	DebugLog string
-
-	// PanicLog is the path to log GO's runtime messages, if not empty.
-	PanicLog string
-
-	// DebugLogFormat is the log format for debug.
-	DebugLogFormat string
-
-	// FileAccess indicates how the filesystem is accessed.
-	FileAccess FileAccessType
-
-	// Overlay is whether to wrap the root filesystem in an overlay.
-	Overlay bool
-
-	// FSGoferHostUDS enables the gofer to mount a host UDS.
-	FSGoferHostUDS bool
-
-	// Network indicates what type of network to use.
-	Network NetworkType
-
-	// EnableRaw indicates whether raw sockets should be enabled. Raw
-	// sockets are disabled by stripping CAP_NET_RAW from the list of
-	// capabilities.
-	EnableRaw bool
-
-	// HardwareGSO indicates that hardware segmentation offload is enabled.
-	HardwareGSO bool
-
-	// SoftwareGSO indicates that software segmentation offload is enabled.
-	SoftwareGSO bool
-
-	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
-	TXChecksumOffload bool
-
-	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
-	RXChecksumOffload bool
-
-	// QDisc indicates the type of queuening discipline to use by default
-	// for non-loopback interfaces.
-	QDisc QueueingDiscipline
-
-	// LogPackets indicates that all network packets should be logged.
-	LogPackets bool
-
-	// Platform is the platform to run on.
-	Platform string
-
-	// Strace indicates that strace should be enabled.
-	Strace bool
-
-	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
-	// true and this list is empty, then all syscalls will be traced.
-	StraceSyscalls []string
-
-	// StraceLogSize is the max size of data blobs to display.
-	StraceLogSize uint
-
-	// DisableSeccomp indicates whether seccomp syscall filters should be
-	// disabled. Pardon the double negation, but default to enabled is important.
-	DisableSeccomp bool
-
-	// WatchdogAction sets what action the watchdog takes when triggered.
-	WatchdogAction watchdog.Action
-
-	// PanicSignal registers signal handling that panics. Usually set to
-	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
-	PanicSignal int
-
-	// ProfileEnable is set to prepare the sandbox to be profiled.
-	ProfileEnable bool
-
-	// RestoreFile is the path to the saved container image
-	RestoreFile string
-
-	// NumNetworkChannels controls the number of AF_PACKET sockets that map
-	// to the same underlying network device. This allows netstack to better
-	// scale for high throughput use cases.
-	NumNetworkChannels int
-
-	// Rootless allows the sandbox to be started with a user that is not root.
-	// Defense is depth measures are weaker with rootless. Specifically, the
-	// sandbox and Gofer process run as root inside a user namespace with root
-	// mapped to the caller's user.
-	Rootless bool
-
-	// AlsoLogToStderr allows to send log messages to stderr.
-	AlsoLogToStderr bool
-
-	// ReferenceLeakMode sets reference leak check mode
-	ReferenceLeakMode refs.LeakMode
-
-	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
-	// is on a Linux overlayfs mount, which does not necessarily preserve
-	// coherence between read-only and subsequent writable file descriptors
-	// representing the "same" file.
-	OverlayfsStaleRead bool
-
-	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
-	// tests. It allows runsc to start the sandbox process as the current
-	// user, and without chrooting the sandbox process. This can be
-	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
-
-	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
-	// test name in the container environment variables and adds it to the debug
-	// log file name. This is done to help identify the log with the test when
-	// multiple tests are run in parallel, since there is no way to pass
-	// parameters to the runtime from docker.
-	TestOnlyTestNameEnv string
-
-	// CPUNumFromQuota sets CPU number count to available CPU quota, using
-	// least integer value greater than or equal to quota.
-	//
-	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
-	CPUNumFromQuota bool
-
-	// Enables VFS2 (not plumbled through yet).
-	VFS2 bool
-
-	// Enables FUSE usage (not plumbled through yet).
-	FUSE bool
-}
-
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
-	f := []string{
-		"--root=" + c.RootDir,
-		"--debug=" + strconv.FormatBool(c.Debug),
-		"--log=" + c.LogFilename,
-		"--log-format=" + c.LogFormat,
-		"--debug-log=" + c.DebugLog,
-		"--panic-log=" + c.PanicLog,
-		"--debug-log-format=" + c.DebugLogFormat,
-		"--file-access=" + c.FileAccess.String(),
-		"--overlay=" + strconv.FormatBool(c.Overlay),
-		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
-		"--network=" + c.Network.String(),
-		"--log-packets=" + strconv.FormatBool(c.LogPackets),
-		"--platform=" + c.Platform,
-		"--strace=" + strconv.FormatBool(c.Strace),
-		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
-		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
-		"--watchdog-action=" + c.WatchdogAction.String(),
-		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
-		"--profile=" + strconv.FormatBool(c.ProfileEnable),
-		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
-		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
-		"--rootless=" + strconv.FormatBool(c.Rootless),
-		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
-		"--gso=" + strconv.FormatBool(c.HardwareGSO),
-		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
-		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
-		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
-		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
-		"--qdisc=" + c.QDisc.String(),
-	}
-	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota")
-	}
-	// Only include these if set since it is never to be used by users.
-	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		f = append(f, "--TESTONLY-unsafe-nonroot=true")
-	}
-	if len(c.TestOnlyTestNameEnv) != 0 {
-		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
-	}
-
-	if c.VFS2 {
-		f = append(f, "--vfs2=true")
-	}
-
-	if c.FUSE {
-		f = append(f, "--fuse=true")
-	}
-
-	return f
-}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 626a3816e..68a2b45cf 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -220,7 +221,7 @@ type StartArgs struct {
 	Spec *specs.Spec
 
 	// Config is the runsc-specific configuration for the sandbox.
-	Conf *Config
+	Conf *config.Config
 
 	// CID is the ID of the container to start.
 	CID string
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 9dd5b0184..163265afe 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -48,6 +48,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -66,7 +67,7 @@ const (
 // tmpfs has some extra supported options that we must pass through.
 var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
 	upperFlags := lowerFlags
 	upperFlags.ReadOnly = false
@@ -156,7 +157,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 }
 
 // p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
@@ -167,7 +168,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 		// enablement.
 		opts = append(opts, "privateunixsocket=true")
 	}
-	if fa == FileAccessShared {
+	if fa == config.FileAccessShared {
 		opts = append(opts, "cache=remote_revalidating")
 	}
 	return opts
@@ -281,7 +282,7 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
 	if conf.VFS2 {
 		return setupContainerVFS2(ctx, conf, mntr, procArgs)
 	}
@@ -468,11 +469,11 @@ func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	return nil
 }
 
-func (m *mountHint) fileAccessType() FileAccessType {
+func (m *mountHint) fileAccessType() config.FileAccessType {
 	if m.share == container {
-		return FileAccessExclusive
+		return config.FileAccessExclusive
 	}
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 func filterUnsupportedOptions(mount specs.Mount) []string {
@@ -576,7 +577,7 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // processHints processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
+func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
 	if conf.VFS2 {
 		return c.processHintsVFS2(conf, creds)
 	}
@@ -600,7 +601,7 @@ func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) e
 // setupFS is used to set up the file system for all containers. This is the
 // main entry point method, with most of the other being internal only. It
 // returns the mount namespace that is created for the container.
-func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
 	log.Infof("Configuring container's file system")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -626,7 +627,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
 	rootInode, err := c.createRootMount(ctx, conf)
 	if err != nil {
 		return nil, fmt.Errorf("creating filesystem for container: %v", err)
@@ -638,7 +639,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
 	root := mns.Root()
 	defer root.DecRef(ctx)
 
@@ -674,7 +675,7 @@ func (c *containerMounter) checkDispenser() error {
 
 // mountSharedMaster mounts the master of a volume that is shared among
 // containers in a pod. It returns the root mount's inode.
-func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
@@ -714,7 +715,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config,
 }
 
 // createRootMount creates the root filesystem.
-func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
 	// First construct the filesystem from the spec.Root.
 	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
 
@@ -759,7 +760,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 
 // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) {
 	var (
 		fsName     string
 		opts       []string
@@ -793,19 +794,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, nil
 }
 
-func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType {
 	if hint := c.hints.findMount(mount); hint != nil {
 		return hint.fileAccessType()
 	}
 	// Non-root bind mounts are always shared if no hints were provided.
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
 // 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
@@ -904,7 +905,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 
 // addRestoreMount adds a mount to the MountSources map used for restoring a
 // checkpointed container.
-func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
 	if err != nil {
 		return err
@@ -929,7 +930,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
 
 // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
 // the mounts to the environment.
-func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
 	renv := &fs.RestoreEnvironment{
 		MountSources: make(map[string][]fs.MountArgs),
 	}
@@ -984,7 +985,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
 	for _, m := range c.mounts {
 		if filepath.Clean(m.Destination) == "/tmp" {
 			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 912037075..e986231e5 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -20,6 +20,7 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestPodMountHintsHappy(t *testing.T) {
@@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) {
 	for _, tst := range []struct {
 		name        string
 		annotations map[string]string
-		want        FileAccessType
+		want        config.FileAccessType
 	}{
 		{
 			name: "container=exclusive",
@@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessExclusive,
+			want: config.FileAccessExclusive,
 		},
 		{
 			name: "pod=shared",
@@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "pod",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "shared=shared",
@@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "shared",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "default=shared",
@@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 	} {
 		t.Run(tst.name, func(t *testing.T) {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 40c6f99fd..e8ea5093b 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -67,6 +67,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
@@ -79,7 +80,7 @@ import (
 )
 
 type containerInfo struct {
-	conf *Config
+	conf *config.Config
 
 	// spec is the base configuration for the root container.
 	spec *specs.Spec
@@ -165,7 +166,7 @@ type Args struct {
 	// Spec is the sandbox specification.
 	Spec *specs.Spec
 	// Conf is the system configuration.
-	Conf *Config
+	Conf *config.Config
 	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
 	// of this FD and may close it at any time.
 	ControllerFD int
@@ -471,7 +472,7 @@ func (l *Loader) Destroy() {
 	}
 }
 
-func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
 	p, err := platform.Lookup(conf.Platform)
 	if err != nil {
 		panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
@@ -504,7 +505,7 @@ func (l *Loader) installSeccompFilters() error {
 	} else {
 		opts := filter.Options{
 			Platform:      l.k.Platform,
-			HostNetwork:   l.root.conf.Network == NetworkHost,
+			HostNetwork:   l.root.conf.Network == config.NetworkHost,
 			ProfileEnable: l.root.conf.ProfileEnable,
 			ControllerFD:  l.ctrl.srv.FD(),
 		}
@@ -531,7 +532,7 @@ func (l *Loader) Run() error {
 }
 
 func (l *Loader) run() error {
-	if l.root.conf.Network == NetworkHost {
+	if l.root.conf.Network == config.NetworkHost {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
@@ -629,7 +630,7 @@ func (l *Loader) createContainer(cid string) error {
 // startContainer starts a child container. It returns the thread group ID of
 // the newly created process. Caller owns 'files' and may close them after
 // this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*os.File) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -1017,17 +1018,17 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
 	// Create an empty network stack because the network namespace may be empty at
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
 	switch conf.Network {
-	case NetworkHost:
+	case config.NetworkHost:
 		// No network namespacing support for hostinet yet, hence creator is nil.
 		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
-	case NetworkNone, NetworkSandbox:
+	case config.NetworkNone, config.NetworkSandbox:
 		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
 		if err != nil {
 			return nil, err
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index aa3fdf96c..03cbaec33 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -34,6 +34,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 )
 
@@ -45,10 +46,10 @@ func init() {
 	}
 }
 
-func testConfig() *Config {
-	return &Config{
+func testConfig() *config.Config {
+	return &config.Config{
 		RootDir:        "unused_root_dir",
-		Network:        NetworkNone,
+		Network:        config.NetworkNone,
 		DisableSeccomp: true,
 		Platform:       "ptrace",
 	}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 4e1fa7665..988573640 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 var (
@@ -78,44 +79,6 @@ type DefaultRoute struct {
 	Name  string
 }
 
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
-	// QDiscNone disables any queueing for the underlying FD.
-	QDiscNone QueueingDiscipline = iota
-
-	// QDiscFIFO applies a simple fifo based queue to the underlying
-	// FD.
-	QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
-	switch s {
-	case "none":
-		return QDiscNone, nil
-	case "fifo":
-		return QDiscFIFO, nil
-	default:
-		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
-	}
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
-	switch q {
-	case QDiscNone:
-		return "none"
-	case QDiscFIFO:
-		return "fifo"
-	default:
-		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
-	}
-}
-
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
 	Name               string
@@ -127,7 +90,7 @@ type FDBasedLink struct {
 	TXChecksumOffload  bool
 	RXChecksumOffload  bool
 	LinkAddress        net.HardwareAddr
-	QDisc              QueueingDiscipline
+	QDisc              config.QueueingDiscipline
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -247,8 +210,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		switch link.QDisc {
-		case QDiscNone:
-		case QDiscFIFO:
+		case config.QDiscNone:
+		case config.QDiscFIFO:
 			log.Infof("Enabling FIFO QDisc on %q", link.Name)
 			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
 		}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index fbfd3b07c..176981f74 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -16,9 +16,10 @@ package boot
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/strace"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
-func enableStrace(conf *Config) error {
+func enableStrace(conf *config.Config) error {
 	// We must initialize even if strace is not enabled.
 	strace.Initialize()
 
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 08dce8b6c..3da7a64f0 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -42,6 +42,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func registerFilesystems(k *kernel.Kernel) error {
@@ -133,7 +134,7 @@ func registerFilesystems(k *kernel.Kernel) error {
 	return nil
 }
 
-func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
 	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
@@ -149,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
 	return nil
 }
 
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
 	log.Infof("Configuring container's file system with VFS2")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -175,7 +176,7 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
 	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
 
@@ -196,7 +197,7 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *C
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
 	mounts, err := c.prepareMountsVFS2()
 	if err != nil {
 		return err
@@ -256,7 +257,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 	return mounts, nil
 }
 
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
 	root := mns.Root()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
@@ -285,7 +286,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
 	fsName := m.Type
 	var data []string
 
@@ -383,7 +384,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
 	for _, m := range c.mounts {
 		// m.Destination has been cleaned, so it's to use equality here.
 		if m.Destination == "/tmp" {
@@ -448,7 +449,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 // processHintsVFS2 processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
 		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
@@ -469,7 +470,7 @@ func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credential
 
 // mountSharedMasterVFS2 mounts the master of a volume that is shared among
 // containers in a pod.
-func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	mntFD := &mountAndFD{Mount: hint.mount}
@@ -485,7 +486,7 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Conf
 
 // mountSharedSubmount binds mount to a previously mounted volume that is shared
 // among containers in the same pod.
-func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
 	if err := source.checkCompatible(mount); err != nil {
 		return err
 	}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 1b5178dd5..2556f6d9e 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -51,6 +51,7 @@ go_library(
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
+        "//runsc/config",
         "//runsc/console",
         "//runsc/container",
         "//runsc/flag",
@@ -84,7 +85,7 @@ go_test(
         "//pkg/sentry/kernel/auth",
         "//pkg/test/testutil",
         "//pkg/urpc",
-        "//runsc/boot",
+        "//runsc/config",
         "//runsc/container",
         "//runsc/specutils",
         "@com_github_google_go_cmp//cmp:go_default_library",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index f4f247721..357f46517 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -133,7 +134,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Ensure that if there is a panic, all goroutine stacks are printed.
 	debug.SetTraceback("system")
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if b.attached {
 		// Ensure this process is killed after parent process terminates when
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index a84067112..e13a94486 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -24,7 +24,7 @@ import (
 	"github.com/syndtr/gocapability/capability"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -88,7 +88,7 @@ func TestCapabilities(t *testing.T) {
 	conf := testutil.TestConfig(t)
 
 	// Use --network=host to make sandbox use spec's capabilities.
-	conf.Network = boot.NetworkHost
+	conf.Network = config.NetworkHost
 
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index 8a29e521e..db46d509f 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -22,7 +22,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -72,7 +72,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	cont, err := container.Load(conf.RootDir, id)
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 910e97577..4d9085244 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -81,7 +81,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if conf.Rootless {
 		return Errorf("Rootless mode not supported with %q", c.Name())
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 742f8c344..132198222 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -25,7 +25,7 @@ import (
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -82,7 +82,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute.
 func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
 	var c *container.Container
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if d.pid == 0 {
 		// No pid, container ID must have been provided.
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index 0e4863f50..4e49deff8 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -21,7 +21,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -59,14 +59,14 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	if err := d.execute(f.Args(), conf); err != nil {
 		Fatalf("%v", err)
 	}
 	return subcommands.ExitSuccess
 }
 
-func (d *Delete) execute(ids []string, conf *boot.Config) error {
+func (d *Delete) execute(ids []string, conf *config.Config) error {
 	for _, id := range ids {
 		c, err := container.Load(conf.RootDir, id)
 		if err != nil {
diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go
index cb59516a3..e2d994a05 100644
--- a/runsc/cmd/delete_test.go
+++ b/runsc/cmd/delete_test.go
@@ -18,7 +18,7 @@ import (
 	"io/ioutil"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestNotFound(t *testing.T) {
@@ -27,7 +27,7 @@ func TestNotFound(t *testing.T) {
 	if err != nil {
 		t.Fatalf("error creating dir: %v", err)
 	}
-	conf := &boot.Config{RootDir: dir}
+	conf := &config.Config{RootDir: dir}
 
 	d := Delete{}
 	if err := d.execute(ids, conf); err == nil {
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 7d1310c96..d1f2e9e6d 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -30,7 +30,7 @@ import (
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -82,7 +82,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
@@ -125,7 +125,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 	specutils.LogSpec(spec)
 
 	cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
-	if conf.Network == boot.NetworkNone {
+	if conf.Network == config.NetworkNone {
 		netns := specs.LinuxNamespace{
 			Type: specs.NetworkNamespace,
 		}
@@ -135,9 +135,9 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 		spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
 
 	} else if conf.Rootless {
-		if conf.Network == boot.NetworkSandbox {
+		if conf.Network == config.NetworkSandbox {
 			c.notifyUser("*** Warning: using host network due to --rootless ***")
-			conf.Network = boot.NetworkHost
+			conf.Network = config.NetworkHost
 		}
 
 	} else {
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 51f6a98ed..25fe2cf1c 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -22,7 +22,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -72,7 +72,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index d9a94903e..600876a27 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -33,7 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/urpc"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
@@ -105,7 +105,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute. It starts a process in an
 // already created container.
 func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	e, id, err := ex.parseArgs(f, conf.EnableRaw)
 	if err != nil {
 		Fatalf("parsing process spec: %v", err)
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 3966e2d21..7da02c3af 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -30,7 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 	"gvisor.dev/gvisor/runsc/fsgofer/filter"
@@ -107,7 +107,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("reading spec: %v", err)
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if g.setUpRoot {
 		if err := setupRootFS(spec, conf); err != nil {
@@ -263,7 +263,7 @@ func isReadonlyMount(opts []string) bool {
 	return false
 }
 
-func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
+func setupRootFS(spec *specs.Spec, conf *config.Config) error {
 	// Convert all shared mounts into slaves to be sure that nothing will be
 	// propagated outside of our namespace.
 	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
@@ -346,7 +346,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 // setupMounts binds mount all mounts specified in the spec in their correct
 // location inside root. It will resolve relative paths and symlinks. It also
 // creates directories as needed.
-func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
+func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
 			continue
@@ -385,7 +385,7 @@ func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
 // Otherwise, it may follow symlinks to locations that would be overwritten
 // with another mount point and return the wrong location. In short, make sure
 // setupMounts() has been called before.
-func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
+func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
 	cleanMounts := make([]specs.Mount, 0, len(mounts))
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
@@ -467,7 +467,7 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
 }
 
 // adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
-func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
 	rv := make([]string, len(opts))
 	copy(rv, opts)
 
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index 8282ea0e0..04eee99b2 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -23,7 +23,7 @@ import (
 
 	"github.com/google/subcommands"
 	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -63,7 +63,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if k.pid != 0 && k.all {
 		Fatalf("it is invalid to specify both --all and --pid")
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index d8d906fe3..f92d6fef9 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -24,7 +24,7 @@ import (
 
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -63,7 +63,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	ids, err := container.List(conf.RootDir)
 	if err != nil {
 		Fatalf("%v", err)
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index 6f95a9837..0eb1402ed 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -53,7 +53,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	cont, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 7fb8041af..bc58c928f 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -20,7 +20,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/sentry/control"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -58,7 +58,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 72584b326..b16975804 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -20,7 +20,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -77,7 +77,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index 61a55a554..f24823f99 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -54,7 +54,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	cont, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index cf41581ad..1161de67a 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -19,7 +19,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -64,7 +64,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 0205fd9f7..88991b521 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -52,7 +52,7 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index cf2413deb..2bd2ab9f8 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -21,7 +21,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -55,7 +55,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index 29c0a15f0..28d0642ed 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -21,7 +21,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -70,7 +70,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/config/BUILD b/runsc/config/BUILD
new file mode 100644
index 000000000..3c8713d53
--- /dev/null
+++ b/runsc/config/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "config",
+    srcs = [
+        "config.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/refs",
+        "//pkg/sentry/watchdog",
+    ],
+)
diff --git a/runsc/config/config.go b/runsc/config/config.go
new file mode 100644
index 000000000..ca85cef51
--- /dev/null
+++ b/runsc/config/config.go
@@ -0,0 +1,376 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package config provides basic infrastructure to set configuration settings
+// for runsc. The configuration is set by flags to the command line. They can
+// also propagate to a different process using the same flags.
+package config
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+)
+
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
+
+const (
+	// FileAccessShared sends IO requests to a Gofer process that validates the
+	// requests and forwards them to the host.
+	FileAccessShared FileAccessType = iota
+
+	// FileAccessExclusive is the same as FileAccessShared, but enables
+	// extra caching for improved performance. It should only be used if
+	// the sandbox has exclusive access to the filesystem.
+	FileAccessExclusive
+)
+
+// MakeFileAccessType converts type from string.
+func MakeFileAccessType(s string) (FileAccessType, error) {
+	switch s {
+	case "shared":
+		return FileAccessShared, nil
+	case "exclusive":
+		return FileAccessExclusive, nil
+	default:
+		return 0, fmt.Errorf("invalid file access type %q", s)
+	}
+}
+
+func (f FileAccessType) String() string {
+	switch f {
+	case FileAccessShared:
+		return "shared"
+	case FileAccessExclusive:
+		return "exclusive"
+	default:
+		return fmt.Sprintf("unknown(%d)", f)
+	}
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+	// NetworkSandbox uses internal network stack, isolated from the host.
+	NetworkSandbox NetworkType = iota
+
+	// NetworkHost redirects network related syscalls to the host network.
+	NetworkHost
+
+	// NetworkNone sets up just loopback using netstack.
+	NetworkNone
+)
+
+// MakeNetworkType converts type from string.
+func MakeNetworkType(s string) (NetworkType, error) {
+	switch s {
+	case "sandbox":
+		return NetworkSandbox, nil
+	case "host":
+		return NetworkHost, nil
+	case "none":
+		return NetworkNone, nil
+	default:
+		return 0, fmt.Errorf("invalid network type %q", s)
+	}
+}
+
+func (n NetworkType) String() string {
+	switch n {
+	case NetworkSandbox:
+		return "sandbox"
+	case NetworkHost:
+		return "host"
+	case NetworkNone:
+		return "none"
+	default:
+		return fmt.Sprintf("unknown(%d)", n)
+	}
+}
+
+// MakeWatchdogAction converts type from string.
+func MakeWatchdogAction(s string) (watchdog.Action, error) {
+	switch strings.ToLower(s) {
+	case "log", "logwarning":
+		return watchdog.LogWarning, nil
+	case "panic":
+		return watchdog.Panic, nil
+	default:
+		return 0, fmt.Errorf("invalid watchdog action %q", s)
+	}
+}
+
+// MakeRefsLeakMode converts type from string.
+func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
+	switch strings.ToLower(s) {
+	case "disabled":
+		return refs.NoLeakChecking, nil
+	case "log-names":
+		return refs.LeaksLogWarning, nil
+	case "log-traces":
+		return refs.LeaksLogTraces, nil
+	default:
+		return 0, fmt.Errorf("invalid refs leakmode %q", s)
+	}
+}
+
+func refsLeakModeToString(mode refs.LeakMode) string {
+	switch mode {
+	// If not set, default it to disabled.
+	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
+		return "disabled"
+	case refs.LeaksLogWarning:
+		return "log-names"
+	case refs.LeaksLogTraces:
+		return "log-traces"
+	default:
+		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
+	}
+}
+
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a give FDBasedLink.
+type QueueingDiscipline int
+
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying
+	// FD.
+	QDiscFIFO
+)
+
+// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
+// else returns an error.
+func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
+	switch s {
+	case "none":
+		return QDiscNone, nil
+	case "fifo":
+		return QDiscFIFO, nil
+	default:
+		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
+	}
+}
+
+// String implements fmt.Stringer.
+func (q QueueingDiscipline) String() string {
+	switch q {
+	case QDiscNone:
+		return "none"
+	case QDiscFIFO:
+		return "fifo"
+	default:
+		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
+	}
+}
+
+// Config holds configuration that is not part of the runtime spec.
+type Config struct {
+	// RootDir is the runtime root directory.
+	RootDir string
+
+	// Debug indicates that debug logging should be enabled.
+	Debug bool
+
+	// LogFilename is the filename to log to, if not empty.
+	LogFilename string
+
+	// LogFormat is the log format.
+	LogFormat string
+
+	// DebugLog is the path to log debug information to, if not empty.
+	DebugLog string
+
+	// PanicLog is the path to log GO's runtime messages, if not empty.
+	PanicLog string
+
+	// DebugLogFormat is the log format for debug.
+	DebugLogFormat string
+
+	// FileAccess indicates how the filesystem is accessed.
+	FileAccess FileAccessType
+
+	// Overlay is whether to wrap the root filesystem in an overlay.
+	Overlay bool
+
+	// FSGoferHostUDS enables the gofer to mount a host UDS.
+	FSGoferHostUDS bool
+
+	// Network indicates what type of network to use.
+	Network NetworkType
+
+	// EnableRaw indicates whether raw sockets should be enabled. Raw
+	// sockets are disabled by stripping CAP_NET_RAW from the list of
+	// capabilities.
+	EnableRaw bool
+
+	// HardwareGSO indicates that hardware segmentation offload is enabled.
+	HardwareGSO bool
+
+	// SoftwareGSO indicates that software segmentation offload is enabled.
+	SoftwareGSO bool
+
+	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
+	TXChecksumOffload bool
+
+	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
+	RXChecksumOffload bool
+
+	// QDisc indicates the type of queuening discipline to use by default
+	// for non-loopback interfaces.
+	QDisc QueueingDiscipline
+
+	// LogPackets indicates that all network packets should be logged.
+	LogPackets bool
+
+	// Platform is the platform to run on.
+	Platform string
+
+	// Strace indicates that strace should be enabled.
+	Strace bool
+
+	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
+	// true and this list is empty, then all syscalls will be traced.
+	StraceSyscalls []string
+
+	// StraceLogSize is the max size of data blobs to display.
+	StraceLogSize uint
+
+	// DisableSeccomp indicates whether seccomp syscall filters should be
+	// disabled. Pardon the double negation, but default to enabled is important.
+	DisableSeccomp bool
+
+	// WatchdogAction sets what action the watchdog takes when triggered.
+	WatchdogAction watchdog.Action
+
+	// PanicSignal registers signal handling that panics. Usually set to
+	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
+	PanicSignal int
+
+	// ProfileEnable is set to prepare the sandbox to be profiled.
+	ProfileEnable bool
+
+	// RestoreFile is the path to the saved container image
+	RestoreFile string
+
+	// NumNetworkChannels controls the number of AF_PACKET sockets that map
+	// to the same underlying network device. This allows netstack to better
+	// scale for high throughput use cases.
+	NumNetworkChannels int
+
+	// Rootless allows the sandbox to be started with a user that is not root.
+	// Defense is depth measures are weaker with rootless. Specifically, the
+	// sandbox and Gofer process run as root inside a user namespace with root
+	// mapped to the caller's user.
+	Rootless bool
+
+	// AlsoLogToStderr allows to send log messages to stderr.
+	AlsoLogToStderr bool
+
+	// ReferenceLeakMode sets reference leak check mode
+	ReferenceLeakMode refs.LeakMode
+
+	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
+	// is on a Linux overlayfs mount, which does not necessarily preserve
+	// coherence between read-only and subsequent writable file descriptors
+	// representing the "same" file.
+	OverlayfsStaleRead bool
+
+	// CPUNumFromQuota sets CPU number count to available CPU quota, using
+	// least integer value greater than or equal to quota.
+	//
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool
+
+	// Enables VFS2 (not plumbled through yet).
+	VFS2 bool
+
+	// Enables FUSE usage (not plumbled through yet).
+	FUSE bool
+
+	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+	// tests. It allows runsc to start the sandbox process as the current
+	// user, and without chrooting the sandbox process. This can be
+	// necessary in test environments that have limited capabilities.
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
+	// test name in the container environment variables and adds it to the debug
+	// log file name. This is done to help identify the log with the test when
+	// multiple tests are run in parallel, since there is no way to pass
+	// parameters to the runtime from docker.
+	TestOnlyTestNameEnv string
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+	f := []string{
+		"--root=" + c.RootDir,
+		"--debug=" + strconv.FormatBool(c.Debug),
+		"--log=" + c.LogFilename,
+		"--log-format=" + c.LogFormat,
+		"--debug-log=" + c.DebugLog,
+		"--panic-log=" + c.PanicLog,
+		"--debug-log-format=" + c.DebugLogFormat,
+		"--file-access=" + c.FileAccess.String(),
+		"--overlay=" + strconv.FormatBool(c.Overlay),
+		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
+		"--network=" + c.Network.String(),
+		"--log-packets=" + strconv.FormatBool(c.LogPackets),
+		"--platform=" + c.Platform,
+		"--strace=" + strconv.FormatBool(c.Strace),
+		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
+		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
+		"--watchdog-action=" + c.WatchdogAction.String(),
+		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
+		"--profile=" + strconv.FormatBool(c.ProfileEnable),
+		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
+		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
+		"--rootless=" + strconv.FormatBool(c.Rootless),
+		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
+		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
+		"--gso=" + strconv.FormatBool(c.HardwareGSO),
+		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
+		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
+		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
+		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
+		"--qdisc=" + c.QDisc.String(),
+	}
+	if c.CPUNumFromQuota {
+		f = append(f, "--cpu-num-from-quota")
+	}
+	if c.VFS2 {
+		f = append(f, "--vfs2=true")
+	}
+	if c.FUSE {
+		f = append(f, "--fuse=true")
+	}
+
+	// Only include these if set since it is never to be used by users.
+	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		f = append(f, "--TESTONLY-unsafe-nonroot=true")
+	}
+	if len(c.TestOnlyTestNameEnv) != 0 {
+		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
+	}
+
+	return f
+}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 9a9ee7e2a..c33755482 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -23,6 +23,7 @@ go_library(
         "//pkg/sync",
         "//runsc/boot",
         "//runsc/cgroup",
+        "//runsc/config",
         "//runsc/sandbox",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
@@ -65,6 +66,7 @@ go_test(
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/boot/platforms",
+        "//runsc/config",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 7ad09bf23..6e1d6a568 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/sandbox"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -269,7 +270,7 @@ type Args struct {
 // New creates the container in a new Sandbox process, unless the metadata
 // indicates that an existing Sandbox should be used. The caller must call
 // Destroy() on the container.
-func New(conf *boot.Config, args Args) (*Container, error) {
+func New(conf *config.Config, args Args) (*Container, error) {
 	log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir)
 	if err := validateID(args.ID); err != nil {
 		return nil, err
@@ -397,7 +398,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 }
 
 // Start starts running the containerized process inside the sandbox.
-func (c *Container) Start(conf *boot.Config) error {
+func (c *Container) Start(conf *config.Config) error {
 	log.Debugf("Start container %q", c.ID)
 
 	if err := c.Saver.lock(); err != nil {
@@ -472,7 +473,7 @@ func (c *Container) Start(conf *boot.Config) error {
 
 // Restore takes a container and replaces its kernel and file system
 // to restore a container from its state file.
-func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
+func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile string) error {
 	log.Debugf("Restore container %q", c.ID)
 	if err := c.Saver.lock(); err != nil {
 		return err
@@ -499,7 +500,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
 }
 
 // Run is a helper that calls Create + Start + Wait.
-func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
+func Run(conf *config.Config, args Args) (syscall.WaitStatus, error) {
 	log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir)
 	c, err := New(conf, args)
 	if err != nil {
@@ -861,7 +862,7 @@ func (c *Container) waitForStopped() error {
 	return backoff.Retry(op, b)
 }
 
-func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
 	// Start with the general config flags.
 	args := conf.ToFlags()
 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 5e8247bc8..6082068c7 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -41,8 +41,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -250,7 +250,7 @@ func readOutputNum(file string, position int) (int, error) {
 
 // run starts the sandbox and waits for it to exit, checking that the
 // application succeeded.
-func run(spec *specs.Spec, conf *boot.Config) error {
+func run(spec *specs.Spec, conf *config.Config) error {
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		return fmt.Errorf("error setting up container: %v", err)
@@ -289,9 +289,9 @@ var (
 )
 
 // configs generates different configurations to run tests.
-func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
+func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
 	// Always load the default config.
-	cs := make(map[string]*boot.Config)
+	cs := make(map[string]*config.Config)
 	for _, o := range opts {
 		switch o {
 		case overlay:
@@ -308,7 +308,7 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 			cs["kvm"] = c
 		case nonExclusiveFS:
 			c := testutil.TestConfig(t)
-			c.FileAccess = boot.FileAccessShared
+			c.FileAccess = config.FileAccessShared
 			cs["non-exclusive"] = c
 		default:
 			panic(fmt.Sprintf("unknown config option %v", o))
@@ -317,7 +317,7 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	return cs
 }
 
-func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
+func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
 	vfs1 := configs(t, opts...)
 
 	var optsVFS2 []configOption
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index e189648f4..1beea123f 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -60,7 +61,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 	return specs, ids
 }
 
-func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
+func startContainers(conf *config.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
 	if len(conf.RootDir) == 0 {
 		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index bac177a88..4ea8fefee 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -25,14 +25,14 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 // TestSharedVolume checks that modifications to a volume mount are propagated
 // into and out of the sandbox.
 func TestSharedVolume(t *testing.T) {
 	conf := testutil.TestConfig(t)
-	conf.FileAccess = boot.FileAccessShared
+	conf.FileAccess = config.FileAccessShared
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -189,7 +189,7 @@ func checkFile(c *Container, filename string, want []byte) error {
 // is reflected inside.
 func TestSharedVolumeFile(t *testing.T) {
 	conf := testutil.TestConfig(t)
-	conf.FileAccess = boot.FileAccessShared
+	conf.FileAccess = config.FileAccessShared
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index c91cfd094..0e4945b3d 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -52,8 +52,8 @@ func init() {
 	}
 }
 
-func configTestName(config *Config) string {
-	if config.ROMount {
+func configTestName(conf *Config) string {
+	if conf.ROMount {
 		return "ROMount"
 	}
 	return "RWMount"
diff --git a/runsc/main.go b/runsc/main.go
index 69cb505fa..c2ffecbdc 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -32,8 +32,8 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/cmd"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -174,21 +174,21 @@ func main() {
 		cmd.Fatalf("%v", err)
 	}
 
-	fsAccess, err := boot.MakeFileAccessType(*fileAccess)
+	fsAccess, err := config.MakeFileAccessType(*fileAccess)
 	if err != nil {
 		cmd.Fatalf("%v", err)
 	}
 
-	if fsAccess == boot.FileAccessShared && *overlay {
+	if fsAccess == config.FileAccessShared && *overlay {
 		cmd.Fatalf("overlay flag is incompatible with shared file access")
 	}
 
-	netType, err := boot.MakeNetworkType(*network)
+	netType, err := config.MakeNetworkType(*network)
 	if err != nil {
 		cmd.Fatalf("%v", err)
 	}
 
-	wa, err := boot.MakeWatchdogAction(*watchdogAction)
+	wa, err := config.MakeWatchdogAction(*watchdogAction)
 	if err != nil {
 		cmd.Fatalf("%v", err)
 	}
@@ -197,12 +197,12 @@ func main() {
 		cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
 	}
 
-	refsLeakMode, err := boot.MakeRefsLeakMode(*referenceLeakMode)
+	refsLeakMode, err := config.MakeRefsLeakMode(*referenceLeakMode)
 	if err != nil {
 		cmd.Fatalf("%v", err)
 	}
 
-	queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc)
+	queueingDiscipline, err := config.MakeQueueingDiscipline(*qDisc)
 	if err != nil {
 		cmd.Fatalf("%s", err)
 	}
@@ -212,7 +212,7 @@ func main() {
 	refs.SetLeakMode(refsLeakMode)
 
 	// Create a new Config from the flags.
-	conf := &boot.Config{
+	conf := &config.Config{
 		RootDir:            *rootDir,
 		Debug:              *debug,
 		LogFilename:        *logFilename,
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 2b9d4549d..f0a551a1e 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//runsc/boot",
         "//runsc/boot/platforms",
         "//runsc/cgroup",
+        "//runsc/config",
         "//runsc/console",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 817a923ad..f9abb2d44 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -49,23 +50,23 @@ import (
 //
 // Run the following container to test it:
 //  docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
-func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error {
 	log.Infof("Setting up network")
 
 	switch conf.Network {
-	case boot.NetworkNone:
+	case config.NetworkNone:
 		log.Infof("Network is disabled, create loopback interface only")
 		if err := createDefaultLoopbackInterface(conn); err != nil {
 			return fmt.Errorf("creating default loopback interface: %v", err)
 		}
-	case boot.NetworkSandbox:
+	case config.NetworkSandbox:
 		// Build the path to the net namespace of the sandbox process.
 		// This is what we will copy.
 		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
 		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil {
 			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
 		}
-	case boot.NetworkHost:
+	case config.NetworkHost:
 		// Nothing to do here.
 	default:
 		return fmt.Errorf("invalid network type: %d", conf.Network)
@@ -115,7 +116,7 @@ func isRootNS() (bool, error) {
 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
 // net namespace with the given path, creates them in the sandbox, and removes
 // them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error {
 	// Join the network namespace that we will be copying.
 	restore, err := joinNetNS(nsPath)
 	if err != nil {
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 36bb0c9c9..a339937fb 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -41,6 +41,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -116,7 +117,7 @@ type Args struct {
 
 // New creates the sandbox process. The caller must call Destroy() on the
 // sandbox.
-func New(conf *boot.Config, args *Args) (*Sandbox, error) {
+func New(conf *config.Config, args *Args) (*Sandbox, error) {
 	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
 	// The Cleanup object cleans up partially created sandboxes when an error
 	// occurs. Any errors occurring during cleanup itself are ignored.
@@ -180,7 +181,7 @@ func (s *Sandbox) CreateContainer(cid string) error {
 }
 
 // StartRoot starts running the root container process inside the sandbox.
-func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
+func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
 	log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
 	conn, err := s.sandboxConnect()
 	if err != nil {
@@ -203,7 +204,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
 }
 
 // StartContainer starts running a non-root container inside the sandbox.
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error {
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error {
 	for _, f := range goferFiles {
 		defer f.Close()
 	}
@@ -232,7 +233,7 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string
 }
 
 // Restore sends the restore call for a container in the sandbox.
-func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error {
+func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, filename string) error {
 	log.Debugf("Restore sandbox %q", s.ID)
 
 	rf, err := os.Open(filename)
@@ -344,7 +345,7 @@ func (s *Sandbox) connError(err error) error {
 
 // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
 // command, passing in the bundle dir.
-func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncFile *os.File) error {
+func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
 	// nextFD is used to get unused FDs that we can pass to the sandbox.  It
 	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
 	nextFD := 3
@@ -555,10 +556,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	// Joins the network namespace if network is enabled. the sandbox talks
 	// directly to the host network, which may have been configured in the
 	// namespace.
-	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != boot.NetworkNone {
+	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
 		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
 		nss = append(nss, ns)
-	} else if conf.Network == boot.NetworkHost {
+	} else if conf.Network == config.NetworkHost {
 		log.Infof("Sandbox will be started in the host network namespace")
 	} else {
 		log.Infof("Sandbox will be started in new network namespace")
@@ -568,7 +569,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	// User namespace depends on the network type. Host network requires to run
 	// inside the user namespace specified in the spec or the current namespace
 	// if none is configured.
-	if conf.Network == boot.NetworkHost {
+	if conf.Network == config.NetworkHost {
 		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
 			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
 			nss = append(nss, userns)
@@ -1179,7 +1180,7 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 
 // checkBinaryPermissions verifies that the required binary bits are set on
 // the runsc executable.
-func checkBinaryPermissions(conf *boot.Config) error {
+func checkBinaryPermissions(conf *config.Config) error {
 	// All platforms need the other exe bit
 	neededBits := os.FileMode(0001)
 	if conf.Platform == platforms.Ptrace {
diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md
index 76bbabc13..2256ee9d5 100644
--- a/website/blog/2019-11-18-security-basics.md
+++ b/website/blog/2019-11-18-security-basics.md
@@ -188,7 +188,7 @@ for direct access to some files. And most files will be remotely accessed
 through the Gofers, in which case no FDs are donated to the Sentry.
 
 The Sentry itself is only allowed access to specific
-[whitelisted syscalls](https://github.com/google/gvisor/blob/master/runsc/boot/config.go).
+[whitelisted syscalls](https://github.com/google/gvisor/blob/master/runsc/config/config.go).
 Without networking, the Sentry needs 53 host syscalls in order to function, and
 with networking, it uses an additional 15[^8]. By limiting the whitelist to only
 these needed syscalls, we radically reduce the amount of host OS attack surface.
-- 
cgit v1.2.3


From c0ae8604b524b80d77a1596fded05ef09d1f76fd Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 20 Aug 2020 10:40:26 -0700
Subject: Fix tabs in lock-ordering doc.

PiperOrigin-RevId: 327654207
---
 pkg/sentry/vfs/vfs.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 9c2420683..8a79e1325 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -24,9 +24,9 @@
 //           Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
 //         VirtualFilesystem.filesystemsMu
 //       EpollInstance.mu
-//		   Inotify.mu
-// 		     Watches.mu
-//  		     Inotify.evMu
+//       Inotify.mu
+//         Watches.mu
+//           Inotify.evMu
 // VirtualFilesystem.fsTypesMu
 //
 // Locking Dentry.mu in multiple Dentries requires holding
-- 
cgit v1.2.3


From 78cc2396bb1b3d89c4606fa95a77b151bb529c96 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 20 Aug 2020 11:05:37 -0700
Subject: Use a explicit random src for RandomID.

PiperOrigin-RevId: 327659759
---
 pkg/test/testutil/testutil.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index 3cb6c6814..42d79f5c2 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -243,12 +243,15 @@ func writeSpec(dir string, spec *specs.Spec) error {
 	return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755)
 }
 
+// idRandomSrc is a pseudo random generator used to in RandomID.
+var idRandomSrc = rand.New(rand.NewSource(time.Now().UnixNano()))
+
 // RandomID returns 20 random bytes following the given prefix.
 func RandomID(prefix string) string {
 	// Read 20 random bytes.
 	b := make([]byte, 20)
 	// "[Read] always returns len(p) and a nil error." --godoc
-	if _, err := rand.Read(b); err != nil {
+	if _, err := idRandomSrc.Read(b); err != nil {
 		panic("rand.Read failed: " + err.Error())
 	}
 	if prefix != "" {
-- 
cgit v1.2.3


From bcd92e97513c0bfa6255f21a7330e18b5e8c7f1e Mon Sep 17 00:00:00 2001
From: Arthur Sfez <asfez@google.com>
Date: Thu, 20 Aug 2020 12:04:36 -0700
Subject: Only use the NextHeader value of the first IPv6 fragment extension
 header.

As per RFC 8200 Section 4.5:
  The Next Header field of the last header of the Per-Fragment
  headers is obtained from the Next Header field of the first
  fragment's Fragment header.

Test:
  - pkg/tcpip/network/ipv6:ipv6_test
  - pkg/tcpip/network/ipv4:ipv4_test
  - pkg/tcpip/network/fragmentation:fragmentation_test

Updates #2197

PiperOrigin-RevId: 327671635
---
 pkg/tcpip/network/fragmentation/BUILD              |  4 +-
 pkg/tcpip/network/fragmentation/fragmentation.go   | 25 ++++++----
 .../network/fragmentation/fragmentation_test.go    | 57 +++++++++++++++-------
 pkg/tcpip/network/fragmentation/reassembler.go     | 23 ++++++---
 pkg/tcpip/network/ipv4/ipv4.go                     |  6 ++-
 pkg/tcpip/network/ipv6/ipv6.go                     | 10 ++--
 pkg/tcpip/network/ipv6/ipv6_test.go                | 40 +++++++++++++++
 7 files changed, 125 insertions(+), 40 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index d1c728ccf..96c5f42f8 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -41,5 +41,7 @@ go_test(
         "reassembler_test.go",
     ],
     library = ":fragmentation",
-    deps = ["//pkg/tcpip/buffer"],
+    deps = [
+        "//pkg/tcpip/buffer",
+    ],
 )
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 1827666c5..6a4843f92 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -120,29 +120,36 @@ func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, rea
 }
 
 // Process processes an incoming fragment belonging to an ID and returns a
-// complete packet when all the packets belonging to that ID have been received.
+// complete packet and its protocol number when all the packets belonging to
+// that ID have been received.
 //
 // [first, last] is the range of the fragment bytes.
 //
 // first must be a multiple of the block size f is configured with. The size
 // of the fragment data must be a multiple of the block size, unless there are
 // no fragments following this fragment (more set to false).
-func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) {
+//
+// proto is the protocol number marked in the fragment being processed. It has
+// to be given here outside of the FragmentID struct because IPv6 should not use
+// the protocol to identify a fragment.
+func (f *Fragmentation) Process(
+	id FragmentID, first, last uint16, more bool, proto uint8, vv buffer.VectorisedView) (
+	buffer.VectorisedView, uint8, bool, error) {
 	if first > last {
-		return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs)
 	}
 
 	if first%f.blockSize != 0 {
-		return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs)
 	}
 
 	fragmentSize := last - first + 1
 	if more && fragmentSize%f.blockSize != 0 {
-		return buffer.VectorisedView{}, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs)
 	}
 
 	if l := vv.Size(); l < int(fragmentSize) {
-		return buffer.VectorisedView{}, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs)
 	}
 	vv.CapLength(int(fragmentSize))
 
@@ -160,14 +167,14 @@ func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv
 	}
 	f.mu.Unlock()
 
-	res, done, consumed, err := r.process(first, last, more, vv)
+	res, firstFragmentProto, done, consumed, err := r.process(first, last, more, proto, vv)
 	if err != nil {
 		// We probably got an invalid sequence of fragments. Just
 		// discard the reassembler and move on.
 		f.mu.Lock()
 		f.release(r)
 		f.mu.Unlock()
-		return buffer.VectorisedView{}, false, fmt.Errorf("fragmentation processing error: %v", err)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("fragmentation processing error: %w", err)
 	}
 	f.mu.Lock()
 	f.size += consumed
@@ -186,7 +193,7 @@ func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv
 		}
 	}
 	f.mu.Unlock()
-	return res, done, nil
+	return res, firstFragmentProto, done, nil
 }
 
 func (f *Fragmentation) release(r *reassembler) {
diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go
index 9eedd33c4..416604659 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation_test.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go
@@ -38,12 +38,14 @@ type processInput struct {
 	first uint16
 	last  uint16
 	more  bool
+	proto uint8
 	vv    buffer.VectorisedView
 }
 
 type processOutput struct {
-	vv   buffer.VectorisedView
-	done bool
+	vv    buffer.VectorisedView
+	proto uint8
+	done  bool
 }
 
 var processTestCases = []struct {
@@ -62,6 +64,17 @@ var processTestCases = []struct {
 			{vv: vv(4, "01", "23"), done: true},
 		},
 	},
+	{
+		comment: "Next Header protocol mismatch",
+		in: []processInput{
+			{id: FragmentID{ID: 0}, first: 0, last: 1, more: true, proto: 6, vv: vv(2, "01")},
+			{id: FragmentID{ID: 0}, first: 2, last: 3, more: false, proto: 17, vv: vv(2, "23")},
+		},
+		out: []processOutput{
+			{vv: buffer.VectorisedView{}, done: false},
+			{vv: vv(4, "01", "23"), proto: 6, done: true},
+		},
+	},
 	{
 		comment: "Two IDs",
 		in: []processInput{
@@ -83,18 +96,26 @@ func TestFragmentationProcess(t *testing.T) {
 	for _, c := range processTestCases {
 		t.Run(c.comment, func(t *testing.T) {
 			f := NewFragmentation(minBlockSize, 1024, 512, DefaultReassembleTimeout)
+			firstFragmentProto := c.in[0].proto
 			for i, in := range c.in {
-				vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv)
+				vv, proto, done, err := f.Process(in.id, in.first, in.last, in.more, in.proto, in.vv)
 				if err != nil {
-					t.Fatalf("f.Process(%+v, %+d, %+d, %t, %+v) failed: %v", in.id, in.first, in.last, in.more, in.vv, err)
+					t.Fatalf("f.Process(%+v, %d, %d, %t, %d, %X) failed: %s",
+						in.id, in.first, in.last, in.more, in.proto, in.vv.ToView(), err)
 				}
 				if !reflect.DeepEqual(vv, c.out[i].vv) {
-					t.Errorf("got Process(%d) = %+v, want = %+v", i, vv, c.out[i].vv)
+					t.Errorf("got Process(%+v, %d, %d, %t, %d, %X) = (%X, _, _, _), want = (%X, _, _, _)",
+						in.id, in.first, in.last, in.more, in.proto, in.vv.ToView(), vv.ToView(), c.out[i].vv.ToView())
 				}
 				if done != c.out[i].done {
-					t.Errorf("got Process(%d) = %+v, want = %+v", i, done, c.out[i].done)
+					t.Errorf("got Process(%+v, %d, %d, %t, %d, _) = (_, _, %t, _), want = (_, _, %t, _)",
+						in.id, in.first, in.last, in.more, in.proto, done, c.out[i].done)
 				}
 				if c.out[i].done {
+					if firstFragmentProto != proto {
+						t.Errorf("got Process(%+v, %d, %d, %t, %d, _) = (_, %d, _, _), want = (_, %d, _, _)",
+							in.id, in.first, in.last, in.more, in.proto, proto, firstFragmentProto)
+					}
 					if _, ok := f.reassemblers[in.id]; ok {
 						t.Errorf("Process(%d) did not remove buffer from reassemblers", i)
 					}
@@ -113,14 +134,14 @@ func TestReassemblingTimeout(t *testing.T) {
 	timeout := time.Millisecond
 	f := NewFragmentation(minBlockSize, 1024, 512, timeout)
 	// Send first fragment with id = 0, first = 0, last = 0, and more = true.
-	f.Process(FragmentID{}, 0, 0, true, vv(1, "0"))
+	f.Process(FragmentID{}, 0, 0, true, 0xFF, vv(1, "0"))
 	// Sleep more than the timeout.
 	time.Sleep(2 * timeout)
 	// Send another fragment that completes a packet.
 	// However, no packet should be reassembled because the fragment arrived after the timeout.
-	_, done, err := f.Process(FragmentID{}, 1, 1, false, vv(1, "1"))
+	_, _, done, err := f.Process(FragmentID{}, 1, 1, false, 0xFF, vv(1, "1"))
 	if err != nil {
-		t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err)
+		t.Fatalf("f.Process(0, 1, 1, false, 0xFF, vv(1, \"1\")) failed: %v", err)
 	}
 	if done {
 		t.Errorf("Fragmentation does not respect the reassembling timeout.")
@@ -130,15 +151,15 @@ func TestReassemblingTimeout(t *testing.T) {
 func TestMemoryLimits(t *testing.T) {
 	f := NewFragmentation(minBlockSize, 3, 1, DefaultReassembleTimeout)
 	// Send first fragment with id = 0.
-	f.Process(FragmentID{ID: 0}, 0, 0, true, vv(1, "0"))
+	f.Process(FragmentID{ID: 0}, 0, 0, true, 0xFF, vv(1, "0"))
 	// Send first fragment with id = 1.
-	f.Process(FragmentID{ID: 1}, 0, 0, true, vv(1, "1"))
+	f.Process(FragmentID{ID: 1}, 0, 0, true, 0xFF, vv(1, "1"))
 	// Send first fragment with id = 2.
-	f.Process(FragmentID{ID: 2}, 0, 0, true, vv(1, "2"))
+	f.Process(FragmentID{ID: 2}, 0, 0, true, 0xFF, vv(1, "2"))
 
 	// Send first fragment with id = 3. This should caused id = 0 and id = 1 to be
 	// evicted.
-	f.Process(FragmentID{ID: 3}, 0, 0, true, vv(1, "3"))
+	f.Process(FragmentID{ID: 3}, 0, 0, true, 0xFF, vv(1, "3"))
 
 	if _, ok := f.reassemblers[FragmentID{ID: 0}]; ok {
 		t.Errorf("Memory limits are not respected: id=0 has not been evicted.")
@@ -154,9 +175,9 @@ func TestMemoryLimits(t *testing.T) {
 func TestMemoryLimitsIgnoresDuplicates(t *testing.T) {
 	f := NewFragmentation(minBlockSize, 1, 0, DefaultReassembleTimeout)
 	// Send first fragment with id = 0.
-	f.Process(FragmentID{}, 0, 0, true, vv(1, "0"))
+	f.Process(FragmentID{}, 0, 0, true, 0xFF, vv(1, "0"))
 	// Send the same packet again.
-	f.Process(FragmentID{}, 0, 0, true, vv(1, "0"))
+	f.Process(FragmentID{}, 0, 0, true, 0xFF, vv(1, "0"))
 
 	got := f.size
 	want := 1
@@ -248,12 +269,12 @@ func TestErrors(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			f := NewFragmentation(test.blockSize, HighFragThreshold, LowFragThreshold, DefaultReassembleTimeout)
-			_, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, vv(len(test.data), test.data))
+			_, _, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, 0, vv(len(test.data), test.data))
 			if !errors.Is(err, test.err) {
-				t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, _, %v), want = (_, _, %v)", test.first, test.last, test.more, test.data, err, test.err)
+				t.Errorf("got Process(_, %d, %d, %t, _, %q) = (_, _, _, %v), want = (_, _, _, %v)", test.first, test.last, test.more, test.data, err, test.err)
 			}
 			if done {
-				t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, true, _), want = (_, false, _)", test.first, test.last, test.more, test.data)
+				t.Errorf("got Process(_, %d, %d, %t, _, %q) = (_, _, true, _), want = (_, _, false, _)", test.first, test.last, test.more, test.data)
 			}
 		})
 	}
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
index 50d30bbf0..f044867dc 100644
--- a/pkg/tcpip/network/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -34,6 +34,7 @@ type reassembler struct {
 	reassemblerEntry
 	id           FragmentID
 	size         int
+	proto        uint8
 	mu           sync.Mutex
 	holes        []hole
 	deleted      int
@@ -46,7 +47,6 @@ func newReassembler(id FragmentID) *reassembler {
 	r := &reassembler{
 		id:           id,
 		holes:        make([]hole, 0, 16),
-		deleted:      0,
 		heap:         make(fragHeap, 0, 8),
 		creationTime: time.Now(),
 	}
@@ -78,7 +78,7 @@ func (r *reassembler) updateHoles(first, last uint16, more bool) bool {
 	return used
 }
 
-func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int, error) {
+func (r *reassembler) process(first, last uint16, more bool, proto uint8, vv buffer.VectorisedView) (buffer.VectorisedView, uint8, bool, int, error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	consumed := 0
@@ -86,7 +86,18 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
 		// A concurrent goroutine might have already reassembled
 		// the packet and emptied the heap while this goroutine
 		// was waiting on the mutex. We don't have to do anything in this case.
-		return buffer.VectorisedView{}, false, consumed, nil
+		return buffer.VectorisedView{}, 0, false, consumed, nil
+	}
+	// For IPv6, it is possible to have different Protocol values between
+	// fragments of a packet (because, unlike IPv4, the Protocol is not used to
+	// identify a fragment). In this case, only the Protocol of the first
+	// fragment must be used as per RFC 8200 Section 4.5.
+	//
+	// TODO(gvisor.dev/issue/3648): The entire first IP header should be recorded
+	// here (instead of just the protocol) because most IP options should be
+	// derived from the first fragment.
+	if first == 0 {
+		r.proto = proto
 	}
 	if r.updateHoles(first, last, more) {
 		// We store the incoming packet only if it filled some holes.
@@ -96,13 +107,13 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
 	}
 	// Check if all the holes have been deleted and we are ready to reassamble.
 	if r.deleted < len(r.holes) {
-		return buffer.VectorisedView{}, false, consumed, nil
+		return buffer.VectorisedView{}, 0, false, consumed, nil
 	}
 	res, err := r.heap.reassemble()
 	if err != nil {
-		return buffer.VectorisedView{}, false, consumed, fmt.Errorf("fragment reassembly failed: %v", err)
+		return buffer.VectorisedView{}, 0, false, consumed, fmt.Errorf("fragment reassembly failed: %w", err)
 	}
-	return res, true, consumed, nil
+	return res, r.proto, true, consumed, nil
 }
 
 func (r *reassembler) tooOld(timeout time.Duration) bool {
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 79872ec9a..63ffb3660 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -415,18 +415,20 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 		}
 		var ready bool
 		var err error
-		pkt.Data, ready, err = e.protocol.fragmentation.Process(
+		proto := h.Protocol()
+		pkt.Data, _, ready, err = e.protocol.fragmentation.Process(
 			// As per RFC 791 section 2.3, the identification value is unique
 			// for a source-destination pair and protocol.
 			fragmentation.FragmentID{
 				Source:      h.SourceAddress(),
 				Destination: h.DestinationAddress(),
 				ID:          uint32(h.ID()),
-				Protocol:    h.Protocol(),
+				Protocol:    proto,
 			},
 			h.FragmentOffset(),
 			last,
 			h.More(),
+			proto,
 			pkt.Data,
 		)
 		if err != nil {
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 0eafe9790..267d2cce8 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -321,10 +321,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 				return
 			}
 
-			var ready bool
 			// Note that pkt doesn't have its transport header set after reassembly,
 			// and won't until DeliverNetworkPacket sets it.
-			pkt.Data, ready, err = e.protocol.fragmentation.Process(
+			data, proto, ready, err := e.protocol.fragmentation.Process(
 				// IPv6 ignores the Protocol field since the ID only needs to be unique
 				// across source-destination pairs, as per RFC 8200 section 4.5.
 				fragmentation.FragmentID{
@@ -335,6 +334,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 				start,
 				last,
 				extHdr.More(),
+				uint8(rawPayload.Identifier),
 				rawPayload.Buf,
 			)
 			if err != nil {
@@ -342,12 +342,14 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 				r.Stats().IP.MalformedFragmentsReceived.Increment()
 				return
 			}
+			pkt.Data = data
 
 			if ready {
 				// We create a new iterator with the reassembled packet because we could
 				// have more extension headers in the reassembled payload, as per RFC
-				// 8200 section 4.5.
-				it = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data)
+				// 8200 section 4.5. We also use the NextHeader value from the first
+				// fragment.
+				it = header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(proto), pkt.Data)
 			}
 
 		case header.IPv6DestinationOptionsExtHdr:
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 0a183bfde..54787198f 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -865,6 +865,46 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 			},
 			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
 		},
+		{
+			name: "Two fragments with different Next Header values",
+			fragments: []fragmentData{
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1Addr1ToAddr2[:64],
+						},
+					),
+				},
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							// NextHeader value is different than the one in the first fragment, so
+							// this NextHeader should be ignored.
+							buffer.View([]byte{uint8(header.IPv6NoNextHeaderIdentifier), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1Addr1ToAddr2[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
+		},
 		{
 			name: "Two fragments with last fragment size not a multiple of fragment block size",
 			fragments: []fragmentData{
-- 
cgit v1.2.3


From e2c1084cc8eb52bdfda299df2386ba974c320d54 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 20 Aug 2020 13:23:21 -0700
Subject: Skip listening TCP ports when trying to bind a free port.

PiperOrigin-RevId: 327686558
---
 pkg/sentry/socket/netstack/netstack.go             | 15 ++++-
 pkg/tcpip/ports/ports.go                           | 19 ++++++-
 pkg/tcpip/ports/ports_test.go                      |  2 +-
 pkg/tcpip/transport/tcp/endpoint.go                | 60 ++++++++++----------
 pkg/tcpip/transport/udp/endpoint.go                |  2 +-
 test/syscalls/linux/socket_inet_loopback.cc        | 38 +++++++++++++
 .../linux/socket_inet_loopback_nogotsan.cc         | 65 ++++++++++++++++++++++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc     | 25 ++++++++-
 8 files changed, 189 insertions(+), 37 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 0e5913b60..4d0e33696 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -803,7 +803,20 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 	}
 
 	// Issue the bind request to the endpoint.
-	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
+	err := s.Endpoint.Bind(addr)
+	if err == tcpip.ErrNoPortAvailable {
+		// Bind always returns EADDRINUSE irrespective of if the specified port was
+		// already bound or if an ephemeral port was requested but none were
+		// available.
+		//
+		// tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because
+		// UDP connect returns EAGAIN on ephemeral port exhaustion.
+		//
+		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
+		err = tcpip.ErrPortInUse
+	}
+
+	return syserr.TranslateNetstackError(err)
 }
 
 // Listen implements the linux syscall listen(2) for sockets backed by
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index f6d592eb5..d87193650 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -400,7 +400,11 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
 // reserved by another endpoint. If port is zero, ReservePort will search for
 // an unreserved ephemeral port and reserve it, returning its value in the
 // "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) (reservedPort uint16, err *tcpip.Error) {
+//
+// An optional testPort closure can be passed in which if provided will be used
+// to test if the picked port can be used. The function should return true if
+// the port is safe to use, false otherwise.
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress, testPort func(port uint16) bool) (reservedPort uint16, err *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
@@ -412,12 +416,23 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
 		if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice, dst) {
 			return 0, tcpip.ErrPortInUse
 		}
+		if testPort != nil && !testPort(port) {
+			s.releasePortLocked(networks, transport, addr, port, flags.Bits(), bindToDevice, dst)
+			return 0, tcpip.ErrPortInUse
+		}
 		return port, nil
 	}
 
 	// A port wasn't specified, so try to find one.
 	return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
-		return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst), nil
+		if !s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst) {
+			return false, nil
+		}
+		if testPort != nil && !testPort(p) {
+			s.releasePortLocked(networks, transport, addr, p, flags.Bits(), bindToDevice, dst)
+			return false, nil
+		}
+		return true, nil
 	})
 }
 
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
index 58db5868c..4bc949fd8 100644
--- a/pkg/tcpip/ports/ports_test.go
+++ b/pkg/tcpip/ports/ports_test.go
@@ -332,7 +332,7 @@ func TestPortReservation(t *testing.T) {
 					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest)
 					continue
 				}
-				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest)
+				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest, nil /* testPort */)
 				if err != test.want {
 					t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d, %v) = %v, want %v", test.ip, test.port, test.flags, test.device, test.dest, err, test.want)
 				}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 21a4b6e2f..9df22ac84 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2169,7 +2169,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 			if sameAddr && p == e.ID.RemotePort {
 				return false, nil
 			}
-			if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil {
+			if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr, nil /* testPort */); err != nil {
 				if err != tcpip.ErrPortInUse || !reuse {
 					return false, nil
 				}
@@ -2207,7 +2207,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 				tcpEP.notifyProtocolGoroutine(notifyAbort)
 				tcpEP.UnlockUser()
 				// Now try and Reserve again if it fails then we skip.
-				if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil {
+				if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr, nil /* testPort */); err != nil {
 					return false, nil
 				}
 			}
@@ -2505,47 +2505,45 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 		}
 	}
 
-	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{})
-	if err != nil {
-		return err
-	}
-
-	e.boundBindToDevice = e.bindToDevice
-	e.boundPortFlags = e.portFlags
-	e.isPortReserved = true
-	e.effectiveNetProtos = netProtos
-	e.ID.LocalPort = port
-
-	// Any failures beyond this point must remove the port registration.
-	defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) {
-		if err != nil {
-			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice, tcpip.FullAddress{})
-			e.isPortReserved = false
-			e.effectiveNetProtos = nil
-			e.ID.LocalPort = 0
-			e.ID.LocalAddress = ""
-			e.boundNICID = 0
-			e.boundBindToDevice = 0
-			e.boundPortFlags = ports.Flags{}
-		}
-	}(e.boundPortFlags, e.boundBindToDevice)
-
+	var nic tcpip.NICID
 	// If an address is specified, we must ensure that it's one of our
 	// local addresses.
 	if len(addr.Addr) != 0 {
-		nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
+		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
 		if nic == 0 {
 			return tcpip.ErrBadLocalAddress
 		}
-
-		e.boundNICID = nic
 		e.ID.LocalAddress = addr.Addr
 	}
 
-	if err := e.stack.CheckRegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e.boundPortFlags, e.boundBindToDevice); err != nil {
+	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{}, func(p uint16) bool {
+		id := e.ID
+		id.LocalPort = p
+		// CheckRegisterTransportEndpoint should only return an error if there is a
+		// listening endpoint bound with the same id and portFlags and bindToDevice
+		// options.
+		//
+		// NOTE: Only listening and connected endpoint register with
+		// demuxer. Further connected endpoints always have a remote
+		// address/port. Hence this will only return an error if there is a matching
+		// listening endpoint.
+		if err := e.stack.CheckRegisterTransportEndpoint(nic, netProtos, ProtocolNumber, id, e.portFlags, e.bindToDevice); err != nil {
+			return false
+		}
+		return true
+	})
+	if err != nil {
 		return err
 	}
 
+	e.boundBindToDevice = e.bindToDevice
+	e.boundPortFlags = e.portFlags
+	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
+	e.boundNICID = nic
+	e.isPortReserved = true
+	e.effectiveNetProtos = netProtos
+	e.ID.LocalPort = port
+
 	// Mark endpoint as bound.
 	e.setEndpointState(StateBound)
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 73608783c..c33434b75 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1226,7 +1226,7 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
-		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{})
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{}, nil /* testPort */)
 		if err != nil {
 			return id, e.bindToDevice, err
 		}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index c3b42682f..a62a10088 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -2573,6 +2573,44 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
       SyscallSucceeds());
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+       MultipleBindsAllowedNoListeningReuseAddr) {
+  const auto& param = GetParam();
+  // UDP sockets are allowed to bind/listen on the port w/ SO_REUSEADDR, for TCP
+  // this is only permitted if there is no other listening socket.
+  SKIP_IF(param.type != SOCK_STREAM);
+  // Bind the v4 loopback on a v4 socket.
+  const TestAddress& test_addr = V4Loopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Now create a socket and bind it to the same port, this should
+  // succeed since there is no listening socket for the same port.
+  FileDescriptor second_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(second_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(second_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
   auto const& param = GetParam();
   TestAddress const& test_addr = V4Loopback();
diff --git a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
index 791e2bd51..1a0b53394 100644
--- a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
+++ b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
@@ -168,6 +168,71 @@ INSTANTIATE_TEST_SUITE_P(
         TestParam{V6Loopback(), V6Loopback()}),
     DescribeTestParam);
 
+struct ProtocolTestParam {
+  std::string description;
+  int type;
+};
+
+std::string DescribeProtocolTestParam(
+    ::testing::TestParamInfo<ProtocolTestParam> const& info) {
+  return info.param.description;
+}
+
+using SocketMultiProtocolInetLoopbackTest =
+    ::testing::TestWithParam<ProtocolTestParam>;
+
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+       BindAvoidsListeningPortsReuseAddr_NoRandomSave) {
+  const auto& param = GetParam();
+  // UDP sockets are allowed to bind/listen on the port w/ SO_REUSEADDR, for TCP
+  // this is only permitted if there is no other listening socket.
+  SKIP_IF(param.type != SOCK_STREAM);
+
+  DisableSave ds;  // Too many syscalls.
+
+  // A map of port to file descriptor binding the port.
+  std::map<uint16_t, FileDescriptor> listen_sockets;
+
+  // Exhaust all ephemeral ports.
+  while (true) {
+    // Bind the v4 loopback on a v4 socket.
+    TestAddress const& test_addr = V4Loopback();
+    sockaddr_storage bound_addr = test_addr.addr;
+    FileDescriptor bound_fd =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+    ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                           &kSockOptOn, sizeof(kSockOptOn)),
+                SyscallSucceeds());
+
+    int ret = bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len);
+    if (ret != 0) {
+      ASSERT_EQ(errno, EADDRINUSE);
+      break;
+    }
+    // Get the port that we bound.
+    socklen_t bound_addr_len = test_addr.addr_len;
+    ASSERT_THAT(
+        getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                    &bound_addr_len),
+        SyscallSucceeds());
+    uint16_t port = reinterpret_cast<sockaddr_in*>(&bound_addr)->sin_port;
+
+    // Newly bound port should not already be in use by a listening socket.
+    ASSERT_EQ(listen_sockets.find(port), listen_sockets.end());
+    auto fd = bound_fd.get();
+    listen_sockets.insert(std::make_pair(port, std::move(bound_fd)));
+    ASSERT_THAT(listen(fd, SOMAXCONN), SyscallSucceeds());
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllFamilies, SocketMultiProtocolInetLoopbackTest,
+    ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM},
+                      ProtocolTestParam{"UDP", SOCK_DGRAM}),
+    DescribeProtocolTestParam);
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index bc005e2bb..cdc9c2266 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -2121,7 +2121,7 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
-// Check that connect returns EADDRNOTAVAIL when out of local ephemeral ports.
+// Check that connect returns EAGAIN when out of local ephemeral ports.
 // We disable S/R because this test creates a large number of sockets.
 TEST_P(IPv4UDPUnboundSocketTest, UDPConnectPortExhaustion_NoRandomSave) {
   auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -2154,6 +2154,29 @@ TEST_P(IPv4UDPUnboundSocketTest, UDPConnectPortExhaustion_NoRandomSave) {
   }
 }
 
+// Check that bind returns EADDRINUSE when out of local ephemeral ports.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(IPv4UDPUnboundSocketTest, UDPBindPortExhaustion_NoRandomSave) {
+  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  constexpr int kClients = 65536;
+  auto addr = V4Loopback();
+  // Disable cooperative S/R as we are making too many syscalls.
+  DisableSave ds;
+  std::vector<std::unique_ptr<FileDescriptor>> sockets;
+  for (int i = 0; i < kClients; i++) {
+    auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+    int ret =
+        bind(s->get(), reinterpret_cast<sockaddr*>(&addr.addr), addr.addr_len);
+    if (ret == 0) {
+      sockets.push_back(std::move(s));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EADDRINUSE));
+    break;
+  }
+}
+
 // Test that socket will receive packet info control message.
 TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
   // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
-- 
cgit v1.2.3


From a3f446a86fed6f3f70daef91b7f7cb5db4ebd383 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 20 Aug 2020 13:28:43 -0700
Subject: Consistent precondition formatting

Our "Preconditions:" blocks are very useful to determine the input invariants,
but they are bit inconsistent throughout the codebase, which makes them harder
to read (particularly cases with 5+ conditions in a single paragraph).

I've reformatted all of the cases to fit in simple rules:

1. Cases with a single condition are placed on a single line.
2. Cases with multiple conditions are placed in a bulleted list.

This format has been added to the style guide.

I've also mentioned "Postconditions:", though those are much less frequently
used, and all uses already match this style.

PiperOrigin-RevId: 327687465
---
 g3doc/style.md                             |  9 ++++
 pkg/fdnotifier/poll_unsafe.go              |  3 +-
 pkg/flipcall/flipcall.go                   | 31 ++++++-----
 pkg/metric/metric.go                       |  6 +--
 pkg/safemem/seq_unsafe.go                  |  7 +--
 pkg/segment/set.go                         | 32 ++++++-----
 pkg/sentry/fs/copy_up.go                   | 13 +++--
 pkg/sentry/fs/dirent.go                    | 12 ++---
 pkg/sentry/fs/file_operations.go           |  5 +-
 pkg/sentry/fs/fsutil/file_range_set.go     | 10 ++--
 pkg/sentry/fs/fsutil/host_file_mapper.go   | 12 +++--
 pkg/sentry/fs/fsutil/inode_cached.go       |  4 +-
 pkg/sentry/fs/overlay.go                   | 20 ++++---
 pkg/sentry/fs/tty/queue.go                 |  9 ++--
 pkg/sentry/fsimpl/devpts/queue.go          |  9 ++--
 pkg/sentry/fsimpl/ext/filesystem.go        | 12 ++---
 pkg/sentry/fsimpl/gofer/directory.go       | 21 +++++---
 pkg/sentry/fsimpl/gofer/filesystem.go      | 38 ++++++++-----
 pkg/sentry/fsimpl/gofer/gofer.go           |  4 +-
 pkg/sentry/fsimpl/gofer/time.go            | 15 +++---
 pkg/sentry/fsimpl/kernfs/filesystem.go     | 21 +++++---
 pkg/sentry/fsimpl/overlay/directory.go     |  4 +-
 pkg/sentry/fsimpl/overlay/filesystem.go    | 32 +++++++----
 pkg/sentry/fsimpl/overlay/overlay.go       |  4 +-
 pkg/sentry/fsimpl/tmpfs/directory.go       |  5 +-
 pkg/sentry/fsimpl/tmpfs/filesystem.go      | 13 +++--
 pkg/sentry/fsimpl/tmpfs/named_pipe.go      |  4 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go           | 15 ++++--
 pkg/sentry/kernel/kernel.go                | 10 ++--
 pkg/sentry/kernel/ptrace.go                | 25 +++++----
 pkg/sentry/kernel/rseq.go                  | 31 ++++++-----
 pkg/sentry/kernel/task_exec.go             |  7 +--
 pkg/sentry/kernel/task_sched.go            | 11 ++--
 pkg/sentry/kernel/task_signals.go          | 12 +++--
 pkg/sentry/kernel/task_stop.go             | 14 ++---
 pkg/sentry/kernel/task_usermem.go          | 12 +++--
 pkg/sentry/kernel/time/time.go             |  6 ++-
 pkg/sentry/kernel/vdso.go                  |  3 --
 pkg/sentry/loader/elf.go                   | 13 ++---
 pkg/sentry/loader/loader.go                |  4 +-
 pkg/sentry/memmap/mapping_set.go           |  4 +-
 pkg/sentry/memmap/memmap.go                | 59 +++++++++++++--------
 pkg/sentry/mm/address_space.go             |  8 ++-
 pkg/sentry/mm/io.go                        |  9 +++-
 pkg/sentry/mm/pma.go                       | 85 ++++++++++++++++++++----------
 pkg/sentry/mm/syscalls.go                  |  9 ++--
 pkg/sentry/mm/vma.go                       | 42 ++++++++++-----
 pkg/sentry/pgalloc/pgalloc.go              | 10 ++--
 pkg/sentry/platform/interrupt/interrupt.go |  5 +-
 pkg/sentry/platform/platform.go            | 13 +++--
 pkg/sentry/vfs/dentry.go                   |  5 +-
 pkg/sentry/vfs/file_description.go         | 10 ++--
 pkg/sentry/vfs/filesystem.go               | 41 ++++++++------
 pkg/sentry/vfs/mount.go                    | 24 +++++----
 pkg/sentry/vfs/mount_unsafe.go             | 18 ++++---
 pkg/syncevent/broadcaster.go               |  4 +-
 pkg/syncevent/source.go                    |  8 +--
 pkg/tcpip/stack/conntrack.go               |  4 +-
 pkg/tcpip/stack/iptables.go                | 12 ++---
 pkg/unet/unet.go                           |  2 +-
 pkg/usermem/addr_range_seq_unsafe.go       |  6 ++-
 pkg/usermem/usermem.go                     | 77 +++++++++++++++------------
 62 files changed, 596 insertions(+), 377 deletions(-)

(limited to 'pkg')

diff --git a/g3doc/style.md b/g3doc/style.md
index d10549fe9..8258b0233 100644
--- a/g3doc/style.md
+++ b/g3doc/style.md
@@ -46,6 +46,15 @@ protected.
 Each field or variable protected by a mutex should state as such in a comment on
 the field or variable declaration.
 
+### Function comments
+
+Functions with special entry conditions (e.g., a lock must be held) should state
+these conditions in a `Preconditions:` comment block. One condition per line;
+multiple conditions are specified with a bullet (`*`).
+
+Functions with notable exit conditions (e.g., a `Done` function must eventually
+be called by the caller) can similarly have a `Postconditions:` block.
+
 ### Unused returns
 
 Unused returns should be explicitly ignored with underscores. If there is a
diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go
index 4225b04dd..ec2f997a2 100644
--- a/pkg/fdnotifier/poll_unsafe.go
+++ b/pkg/fdnotifier/poll_unsafe.go
@@ -65,8 +65,7 @@ func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask {
 
 // epollWait performs a blocking wait on epfd.
 //
-// Preconditions:
-//  * len(events) > 0
+// Preconditions: len(events) > 0
 func epollWait(epfd int, events []syscall.EpollEvent, msec int) (int, error) {
 	if len(events) == 0 {
 		panic("Empty events passed to EpollWait")
diff --git a/pkg/flipcall/flipcall.go b/pkg/flipcall/flipcall.go
index ec742c091..c4a3366ce 100644
--- a/pkg/flipcall/flipcall.go
+++ b/pkg/flipcall/flipcall.go
@@ -179,8 +179,10 @@ const (
 
 // Connect blocks until the peer Endpoint has called Endpoint.RecvFirst().
 //
-// Preconditions: ep is a client Endpoint. ep.Connect(), ep.RecvFirst(),
-// ep.SendRecv(), and ep.SendLast() have never been called.
+// Preconditions:
+// * ep is a client Endpoint.
+// * ep.Connect(), ep.RecvFirst(), ep.SendRecv(), and ep.SendLast() have never
+//   been called.
 func (ep *Endpoint) Connect() error {
 	err := ep.ctrlConnect()
 	if err == nil {
@@ -192,8 +194,9 @@ func (ep *Endpoint) Connect() error {
 // RecvFirst blocks until the peer Endpoint calls Endpoint.SendRecv(), then
 // returns the datagram length specified by that call.
 //
-// Preconditions: ep is a server Endpoint. ep.SendRecv(), ep.RecvFirst(), and
-// ep.SendLast() have never been called.
+// Preconditions:
+// * ep is a server Endpoint.
+// * ep.SendRecv(), ep.RecvFirst(), and ep.SendLast() have never been called.
 func (ep *Endpoint) RecvFirst() (uint32, error) {
 	if err := ep.ctrlWaitFirst(); err != nil {
 		return 0, err
@@ -211,10 +214,12 @@ func (ep *Endpoint) RecvFirst() (uint32, error) {
 // datagram length, then blocks until the peer Endpoint calls
 // Endpoint.SendRecv() or Endpoint.SendLast().
 //
-// Preconditions: dataLen <= ep.DataCap(). No previous call to ep.SendRecv() or
-// ep.RecvFirst() has returned an error. ep.SendLast() has never been called.
-// If ep is a client Endpoint, ep.Connect() has previously been called and
-// returned nil.
+// Preconditions:
+// * dataLen <= ep.DataCap().
+// * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error.
+// * ep.SendLast() has never been called.
+// * If ep is a client Endpoint, ep.Connect() has previously been called and
+//   returned nil.
 func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) {
 	if dataLen > ep.dataCap {
 		panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap))
@@ -240,10 +245,12 @@ func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) {
 // SendLast causes the peer Endpoint's call to Endpoint.SendRecv() or
 // Endpoint.RecvFirst() to return with the given datagram length.
 //
-// Preconditions: dataLen <= ep.DataCap(). No previous call to ep.SendRecv() or
-// ep.RecvFirst() has returned an error. ep.SendLast() has never been called.
-// If ep is a client Endpoint, ep.Connect() has previously been called and
-// returned nil.
+// Preconditions:
+// * dataLen <= ep.DataCap().
+// * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error.
+// * ep.SendLast() has never been called.
+// * If ep is a client Endpoint, ep.Connect() has previously been called and
+//   returned nil.
 func (ep *Endpoint) SendLast(dataLen uint32) error {
 	if dataLen > ep.dataCap {
 		panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap))
diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 64aa365ce..d012c5734 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -106,8 +106,8 @@ type customUint64Metric struct {
 // after Initialized.
 //
 // Preconditions:
-//  * name must be globally unique.
-//  * Initialize/Disable have not been called.
+// * name must be globally unique.
+// * Initialize/Disable have not been called.
 func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func() uint64) error {
 	if initialized {
 		return ErrInitializationDone
@@ -221,7 +221,7 @@ var (
 // EmitMetricUpdate is thread-safe.
 //
 // Preconditions:
-//  * Initialize has been called.
+// * Initialize has been called.
 func EmitMetricUpdate() {
 	emitMu.Lock()
 	defer emitMu.Unlock()
diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index f5f0574f8..fc4049eeb 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -91,9 +91,10 @@ func BlockSeqFromSlice(slice []Block) BlockSeq {
 	return blockSeqFromSliceLimited(slice, limit)
 }
 
-// Preconditions: The combined length of all Blocks in slice <= limit. If
-// len(slice) != 0, the first Block in slice has non-zero length, and limit >
-// 0.
+// Preconditions:
+// * The combined length of all Blocks in slice <= limit.
+// * If len(slice) != 0, the first Block in slice has non-zero length and
+//   limit > 0.
 func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq {
 	switch len(slice) {
 	case 0:
diff --git a/pkg/segment/set.go b/pkg/segment/set.go
index 1a17ad9cb..fbb31dbea 100644
--- a/pkg/segment/set.go
+++ b/pkg/segment/set.go
@@ -407,7 +407,9 @@ func (s *Set) InsertWithoutMerging(gap GapIterator, r Range, val Value) Iterator
 // and returns an iterator to the inserted segment. All existing iterators
 // (including gap, but not including the returned iterator) are invalidated.
 //
-// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+// Preconditions:
+// * r.Start >= gap.Start().
+// * r.End <= gap.End().
 func (s *Set) InsertWithoutMergingUnchecked(gap GapIterator, r Range, val Value) Iterator {
 	gap = gap.node.rebalanceBeforeInsert(gap)
 	splitMaxGap := trackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get())
@@ -1211,12 +1213,10 @@ func (seg Iterator) End() Key {
 // does not invalidate any iterators.
 //
 // Preconditions:
-//
-// - r.Length() > 0.
-//
-// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
-// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
-// r.start >= seg.PrevSegment().End().
+// * r.Length() > 0.
+// * The new range must not overlap an existing one:
+//   * If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start().
+//   * If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End().
 func (seg Iterator) SetRangeUnchecked(r Range) {
 	seg.node.keys[seg.index] = r
 }
@@ -1241,8 +1241,9 @@ func (seg Iterator) SetRange(r Range) {
 // SetStartUnchecked mutates the iterated segment's start. This operation does
 // not invalidate any iterators.
 //
-// Preconditions: The new start must be valid: start < seg.End(); if
-// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
+// Preconditions: The new start must be valid:
+// * start < seg.End()
+// * If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
 func (seg Iterator) SetStartUnchecked(start Key) {
 	seg.node.keys[seg.index].Start = start
 }
@@ -1264,8 +1265,9 @@ func (seg Iterator) SetStart(start Key) {
 // SetEndUnchecked mutates the iterated segment's end. This operation does not
 // invalidate any iterators.
 //
-// Preconditions: The new end must be valid: end > seg.Start(); if
-// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
+// Preconditions: The new end must be valid:
+// * end > seg.Start().
+// * If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
 func (seg Iterator) SetEndUnchecked(end Key) {
 	seg.node.keys[seg.index].End = end
 }
@@ -1695,9 +1697,11 @@ func (s *Set) ExportSortedSlices() *SegmentDataSlices {
 
 // ImportSortedSlice initializes the given set from the given slice.
 //
-// Preconditions: s must be empty. sds must represent a valid set (the segments
-// in sds must have valid lengths that do not overlap). The segments in sds
-// must be sorted in ascending key order.
+// Preconditions:
+// * s must be empty.
+// * sds must represent a valid set (the segments in sds must have valid
+//   lengths that do not overlap).
+// * The segments in sds must be sorted in ascending key order.
 func (s *Set) ImportSortedSlices(sds *SegmentDataSlices) error {
 	if !s.IsEmpty() {
 		return fmt.Errorf("cannot import into non-empty set %v", s)
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index 735452b07..ff2fe6712 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -107,8 +107,7 @@ func copyUp(ctx context.Context, d *Dirent) error {
 // leave the upper filesystem filled with any number of parent directories
 // but the upper filesystem will never be in an inconsistent state.
 //
-// Preconditions:
-// - d.Inode.overlay is non-nil.
+// Preconditions: d.Inode.overlay is non-nil.
 func copyUpLockedForRename(ctx context.Context, d *Dirent) error {
 	for {
 		// Did we race with another copy up or does there
@@ -183,12 +182,12 @@ func doCopyUp(ctx context.Context, d *Dirent) error {
 // Returns a generic error on failure.
 //
 // Preconditions:
-// - parent.Inode.overlay.upper must be non-nil.
-// - next.Inode.overlay.copyMu must be locked writable.
-// - next.Inode.overlay.lower must be non-nil.
-// - next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
+// * parent.Inode.overlay.upper must be non-nil.
+// * next.Inode.overlay.copyMu must be locked writable.
+// * next.Inode.overlay.lower must be non-nil.
+// * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
 //   or Symlink.
-// - upper filesystem must support setting file ownership and timestamps.
+// * upper filesystem must support setting file ownership and timestamps.
 func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 	// Extract the attributes of the file we wish to copy.
 	attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx)
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index a2f751068..00c526b03 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -413,9 +413,9 @@ func (d *Dirent) descendantOf(p *Dirent) bool {
 // Inode.Lookup, otherwise walk will keep d.mu locked.
 //
 // Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must must not contain "/"s.
 func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
 	if !IsDir(d.Inode.StableAttr) {
 		return nil, syscall.ENOTDIR
@@ -577,9 +577,9 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent,
 // exists returns true if name exists in relation to d.
 //
 // Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must must not contain "/"s.
 func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
 	child, err := d.walk(ctx, root, name, false /* may unlock */)
 	if err != nil {
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index 305c0f840..6ec721022 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -159,8 +159,9 @@ type FileOperations interface {
 	// io provides access to the virtual memory space to which pointers in args
 	// refer.
 	//
-	// Preconditions: The AddressSpace (if any) that io refers to is activated.
-	// Must only be called from a task goroutine.
+	// Preconditions:
+	// * The AddressSpace (if any) that io refers to is activated.
+	// * Must only be called from a task goroutine.
 	Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error)
 }
 
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index bbafebf03..9197aeb88 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -70,7 +70,9 @@ func (seg FileRangeIterator) FileRange() memmap.FileRange {
 
 // FileRangeOf returns the FileRange mapped by mr.
 //
-// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
+// Preconditions:
+// * seg.Range().IsSupersetOf(mr).
+// * mr.Length() != 0.
 func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange {
 	frstart := seg.Value() + (mr.Start - seg.Start())
 	return memmap.FileRange{frstart, frstart + mr.Length()}
@@ -88,8 +90,10 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRan
 // outside of optional. It returns a non-nil error if any error occurs, even
 // if the error only affects offsets in optional, but not in required.
 //
-// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
-// required and optional must be page-aligned.
+// Preconditions:
+// * required.Length() > 0.
+// * optional.IsSupersetOf(required).
+// * required and optional must be page-aligned.
 func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
 	gap := frs.LowerBoundGap(required.Start)
 	for gap.Ok() && gap.Start() < required.End {
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index ef0113b52..1390a9a7f 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -80,7 +80,9 @@ func NewHostFileMapper() *HostFileMapper {
 
 // IncRefOn increments the reference count on all offsets in mr.
 //
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
 func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
 	f.refsMu.Lock()
 	defer f.refsMu.Unlock()
@@ -97,7 +99,9 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
 
 // DecRefOn decrements the reference count on all offsets in mr.
 //
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
 func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
 	f.refsMu.Lock()
 	defer f.refsMu.Unlock()
@@ -204,7 +208,9 @@ func (f *HostFileMapper) UnmapAll() {
 	}
 }
 
-// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m.
+// Preconditions:
+// * f.mapsMu must be locked.
+// * f.mappings[chunkStart] == m.
 func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) {
 	if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 {
 		// This leaks address space and is unexpected, but is otherwise
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index fe8b0b6ac..9eb6f522e 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -684,7 +684,9 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 // maybeGrowFile grows the file's size if data has been written past the old
 // size.
 //
-// Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked.
+// Preconditions:
+// * rw.c.attrMu must be locked.
+// * rw.c.dataMu must be locked.
 func (rw *inodeReadWriter) maybeGrowFile() {
 	// If the write ends beyond the file's previous size, it causes the
 	// file to grow.
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 35013a21b..01a1235b8 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -86,13 +86,12 @@ func isXattrOverlay(name string) bool {
 // NewOverlayRoot produces the root of an overlay.
 //
 // Preconditions:
-//
-// - upper and lower must be non-nil.
-// - upper must not be an overlay.
-// - lower should not expose character devices, pipes, or sockets, because
+// * upper and lower must be non-nil.
+// * upper must not be an overlay.
+// * lower should not expose character devices, pipes, or sockets, because
 //   copying up these types of files is not supported.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
 func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) {
 	if !IsDir(upper.StableAttr) {
 		return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type)
@@ -117,12 +116,11 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount
 // NewOverlayRootFile produces the root of an overlay that points to a file.
 //
 // Preconditions:
-//
-// - lower must be non-nil.
-// - lower should not expose character devices, pipes, or sockets, because
+// * lower must be non-nil.
+// * lower should not expose character devices, pipes, or sockets, because
 //   copying up these types of files is not supported. Neither it can be a dir.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
 func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) {
 	if !IsRegular(lower.StableAttr) {
 		return nil, fmt.Errorf("lower Inode is not a regular file")
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index ceabb9b1e..c5d7ec717 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -104,8 +104,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
 // as whether the read caused more readable data to become available (whether
 // data was pushed from the wait buffer to the read buffer).
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -145,8 +144,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
 
 // write writes to q from userspace.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -188,8 +186,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
 
 // writeBytes writes to q from b.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index dffb4232c..331c13997 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -102,8 +102,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
 // as whether the read caused more readable data to become available (whether
 // data was pushed from the wait buffer to the read buffer).
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -143,8 +142,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
 
 // write writes to q from userspace.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -186,8 +184,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
 
 // writeBytes writes to q from b.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index c714ddf73..8565d1a66 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -81,9 +81,9 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
-//     - !rp.Done().
-//     - inode == vfsd.Impl().(*Dentry).inode.
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
+// * inode == vfsd.Impl().(*Dentry).inode.
 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
 	if !inode.isDir() {
 		return nil, nil, syserror.ENOTDIR
@@ -166,7 +166,7 @@ func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, in
 // walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
+// * filesystem.mu must be locked (for writing if write param is true).
 func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
@@ -194,8 +194,8 @@ func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.De
 // walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
-//     - !rp.Done().
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
 func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 40dce553e..91d2ae199 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -34,8 +34,11 @@ func (d *dentry) isDir() bool {
 	return d.fileType() == linux.S_IFDIR
 }
 
-// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
-// d.isDir(). child must be a newly-created dentry that has never had a parent.
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+// * child must be a newly-created dentry that has never had a parent.
 func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
 	d.IncRef() // reference held by child on its parent
 	child.parent = d
@@ -46,7 +49,9 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
 	d.children[name] = child
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
 func (d *dentry) cacheNegativeLookupLocked(name string) {
 	// Don't cache negative lookups if InteropModeShared is in effect (since
 	// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
@@ -79,8 +84,10 @@ type createSyntheticOpts struct {
 // createSyntheticChildLocked creates a synthetic file with the given name
 // in d.
 //
-// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
-// a child with the given name.
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
+// * d does not already contain a child with the given name.
 func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
 	child := &dentry{
 		refs:      1, // held by d
@@ -151,7 +158,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	return nil
 }
 
-// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+// Preconditions:
+// * d.isDir().
+// * There exists at least one directoryFD representing d.
 func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
 	// presence of concurrent mutation of an iterated directory, so
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 1b6fa4e14..4d581fc29 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -115,9 +115,12 @@ func putDentrySlice(ds *[]*dentry) {
 // Dentries which may become cached as a result of the traversal are appended
 // to *ds.
 //
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
-// must be up to date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+//   to date.
 //
 // Postconditions: The returned dentry's cached metadata is up to date.
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
@@ -185,8 +188,11 @@ afterSymlink:
 // getChildLocked returns a dentry representing the child of parent with the
 // given name. If no such child exists, getChildLocked returns (nil, nil).
 //
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
-// parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
 //
 // Postconditions: If getChildLocked returns a non-nil dentry, its cached
 // metadata is up to date.
@@ -206,7 +212,8 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
 	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
 }
 
-// Preconditions: As for getChildLocked. !parent.isSynthetic().
+// Preconditions: Same as getChildLocked, plus:
+// * !parent.isSynthetic().
 func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
 	if child != nil {
 		// Need to lock child.metadataMu because we might be updating child
@@ -279,9 +286,11 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If
-// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
-// date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+//   to date.
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -328,8 +337,9 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 // createInRemoteDir (if the parent directory is a real remote directory) or
 // createInSyntheticDir (if the parent directory is synthetic) to do so.
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -1087,8 +1097,10 @@ retry:
 	return &fd.vfsfd, nil
 }
 
-// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
-// !d.isSynthetic().
+// Preconditions:
+// * d.fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !d.isSynthetic().
 func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 63e589859..c6696b9d8 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1418,7 +1418,9 @@ func (d *dentry) userXattrSupported() bool {
 	return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
 }
 
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
+// Preconditions:
+// * !d.isSynthetic().
+// * d.isRegularFile() || d.isDir().
 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
 	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
 	// O_TRUNC).
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index e59d07e90..98733253d 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -52,8 +52,9 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	mnt.EndWrite()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -61,8 +62,9 @@ func (d *dentry) touchCtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCMtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -72,8 +74,9 @@ func (d *dentry) touchCMtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// locked d.metadataMu.
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has locked d.metadataMu.
 func (d *dentry) touchCMtimeLocked() {
 	now := d.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&d.mtime, now)
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3e5192edd..e5d6b5c35 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -32,7 +32,9 @@ import (
 //
 // stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
 //
 // Postcondition: Caller must call fs.processDeferredDecRefs*.
 func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
@@ -107,8 +109,11 @@ afterSymlink:
 // or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
 // nil) to verify that the returned child (or lack thereof) is correct.
 //
-// Preconditions: Filesystem.mu must be locked for at least reading.
-// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
 func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
@@ -171,7 +176,9 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 // walkParentDirLocked is loosely analogous to Linux's
 // fs/namei.c:path_parentat().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
 func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
@@ -193,8 +200,10 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
 // checkCreateLocked checks that a file named rp.Component() may be created in
 // directory parentVFSD, then returns rp.Component().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
-// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * parentInode == parentVFSD.Impl().(*Dentry).Inode.
+// * isDir(parentInode) == true.
 func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
 	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return "", err
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
index 6a79f7ffe..b1b292e83 100644
--- a/pkg/sentry/fsimpl/overlay/directory.go
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -29,7 +29,9 @@ func (d *dentry) isDir() bool {
 	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
 func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
 	var readdirErr error
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 86d0164b4..a3cee4047 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -110,8 +110,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
 // Dentries which may have a reference count of zero, and which therefore
 // should be dropped once traversal is complete, are appended to ds.
 //
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
@@ -159,7 +161,9 @@ afterSymlink:
 	return child, nil
 }
 
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
 func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
 	if child, ok := parent.children[name]; ok {
 		return child, nil
@@ -177,7 +181,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
 	return child, nil
 }
 
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
 	childPath := fspath.Parse(name)
 	child := fs.newDentry()
@@ -300,7 +306,9 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 // lookupLayerLocked is similar to lookupLocked, but only returns information
 // about the file rather than a dentry.
 //
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
 	childPath := fspath.Parse(name)
 	lookupLayer := lookupLayerNone
@@ -385,7 +393,9 @@ func (ll lookupLayer) existsInOverlay() bool {
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -425,8 +435,9 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 // doCreateAt checks that creating a file at rp is permitted, then invokes
 // create to do so.
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -851,8 +862,9 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
 	return &fd.vfsfd, nil
 }
 
-// Preconditions: parent.dirMu must be locked. parent does not already contain
-// a child named rp.Component().
+// Preconditions:
+// * parent.dirMu must be locked.
+// * parent does not already contain a child named rp.Component().
 func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	creds := rp.Credentials()
 	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 75cc006bf..4b3dfbc01 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -482,7 +482,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 
 // destroyLocked destroys the dentry.
 //
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
 func (d *dentry) destroyLocked(ctx context.Context) {
 	switch atomic.LoadInt64(&d.refs) {
 	case 0:
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 78b4fc5be..070c75e68 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -57,8 +57,9 @@ func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.Fi
 	return dir
 }
 
-// Preconditions: filesystem.mu must be locked for writing. dir must not
-// already contain a child with the given name.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * dir must not already contain a child with the given name.
 func (dir *directory) insertChildLocked(child *dentry, name string) {
 	child.parent = &dir.dentry
 	child.name = name
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index b0ec177e6..7924a0911 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -39,7 +39,9 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 //
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 	dir, ok := d.inode.impl.(*directory)
 	if !ok {
@@ -97,7 +99,9 @@ afterSymlink:
 // walkParentDirLocked is loosely analogous to Linux's
 // fs/namei.c:path_parentat().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
 func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
 	for !rp.Final() {
 		next, err := stepLocked(ctx, rp, d)
@@ -139,8 +143,9 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error)
 // doCreateAt is loosely analogous to a conjunction of Linux's
 // fs/namei.c:filename_create() and done_path_create().
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 739350cf0..5b0471ff4 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -28,8 +28,8 @@ type namedPipe struct {
 }
 
 // Preconditions:
-//   * fs.mu must be locked.
-//   * rp.Mount().CheckBeginWrite() has been called successfully.
+// * fs.mu must be locked.
+// * rp.Mount().CheckBeginWrite() has been called successfully.
 func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
 	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index de2af6d01..428f62aaa 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -340,8 +340,10 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth
 
 // incLinksLocked increments i's link count.
 //
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-// i.nlink < maxLinks.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
+// * i.nlink < maxLinks.
 func (i *inode) incLinksLocked() {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.incLinksLocked() called with no existing links")
@@ -355,7 +357,9 @@ func (i *inode) incLinksLocked() {
 // decLinksLocked decrements i's link count. If the link count reaches 0, we
 // remove a reference on i as well.
 //
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
 func (i *inode) decLinksLocked(ctx context.Context) {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.decLinksLocked() called with no existing links")
@@ -594,8 +598,9 @@ func (i *inode) touchCMtime() {
 	i.mu.Unlock()
 }
 
-// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
-// inode.mu.
+// Preconditions:
+// * The caller has called vfs.Mount.CheckBeginWrite().
+// * inode.mu must be locked.
 func (i *inode) touchCMtimeLocked() {
 	now := i.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&i.mtime, now)
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 1028d13c6..2e0175e36 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -1067,8 +1067,9 @@ func (k *Kernel) Start() error {
 
 // pauseTimeLocked pauses all Timers and Timekeeper updates.
 //
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
 func (k *Kernel) pauseTimeLocked(ctx context.Context) {
 	// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
 	// Kernel.Start().
@@ -1111,8 +1112,9 @@ func (k *Kernel) pauseTimeLocked(ctx context.Context) {
 // pauseTimeLocked has not been previously called, resumeTimeLocked has no
 // effect.
 //
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
 func (k *Kernel) resumeTimeLocked(ctx context.Context) {
 	if k.cpuClockTicker != nil {
 		k.cpuClockTicker.Resume()
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 619b0cb7c..50df179c3 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -224,8 +224,9 @@ func (s *ptraceStop) Killable() bool {
 // beginPtraceStopLocked does not signal t's tracer or wake it if it is
 // waiting.
 //
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine.
 func (t *Task) beginPtraceStopLocked() bool {
 	t.tg.signalHandlers.mu.Lock()
 	defer t.tg.signalHandlers.mu.Unlock()
@@ -270,8 +271,9 @@ func (t *Task) ptraceTrapLocked(code int32) {
 // ptraceStop, temporarily preventing it from being removed by a concurrent
 // Task.Kill, and returns true. Otherwise it returns false.
 //
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine of t's tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine of t's tracer.
 func (t *Task) ptraceFreeze() bool {
 	t.tg.signalHandlers.mu.Lock()
 	defer t.tg.signalHandlers.mu.Unlock()
@@ -301,8 +303,9 @@ func (t *Task) ptraceUnfreeze() {
 	t.ptraceUnfreezeLocked()
 }
 
-// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be
-// locked.
+// Preconditions:
+// * t must be in a frozen ptraceStop.
+// * t's signal mutex must be locked.
 func (t *Task) ptraceUnfreezeLocked() {
 	// Do this even if the task has been killed to ensure a panic if t.stop is
 	// nil or not a ptraceStop.
@@ -497,8 +500,9 @@ func (t *Task) forgetTracerLocked() {
 // ptraceSignalLocked is called after signal dequeueing to check if t should
 // enter ptrace signal-delivery-stop.
 //
-// Preconditions: The signal mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The signal mutex must be locked.
+// * The caller must be running on the task goroutine.
 func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
 	if linux.Signal(info.Signo) == linux.SIGKILL {
 		return false
@@ -828,8 +832,9 @@ func (t *Task) ptraceInterrupt(target *Task) error {
 	return nil
 }
 
-// Preconditions: The TaskSet mutex must be locked for writing. t must have a
-// tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked for writing.
+// * t must have a tracer.
 func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
 	const valid = uintptr(linux.PTRACE_O_EXITKILL |
 		linux.PTRACE_O_TRACESYSGOOD |
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 18416643b..2a9023fdf 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr {
 // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
 // t's CPU number.
 //
-// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions:
+// * t.RSeqAvailable() == true.
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
 	t.oldRSeqCPUAddr = addr
 
@@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
 	return nil
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqUpdateCPU() error {
 	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
 		t.rseqCPU = -1
@@ -209,8 +212,9 @@ func (t *Task) rseqUpdateCPU() error {
 	return oerr
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) oldRSeqCopyOutCPU() error {
 	if t.oldRSeqCPUAddr == 0 {
 		return nil
@@ -222,8 +226,9 @@ func (t *Task) oldRSeqCopyOutCPU() error {
 	return err
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqCopyOutCPU() error {
 	if t.rseqAddr == 0 {
 		return nil
@@ -240,8 +245,9 @@ func (t *Task) rseqCopyOutCPU() error {
 	return err
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqClearCPU() error {
 	buf := t.CopyScratchBuffer(8)
 	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
@@ -269,8 +275,9 @@ func (t *Task) rseqClearCPU() error {
 //
 // See kernel/rseq.c:rseq_ip_fixup for reference.
 //
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqAddrInterrupt() {
 	if t.rseqAddr == 0 {
 		return
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 5e4fb3e3a..412d471d3 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -237,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 // promoteLocked makes t the leader of its thread group. If t is already the
 // thread group leader, promoteLocked is a no-op.
 //
-// Preconditions: All other tasks in t's thread group, including the existing
-// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
-// be locked for writing.
+// Preconditions:
+// * All other tasks in t's thread group, including the existing leader (if it
+//   is not t), have reached TaskExitZombie.
+// * The TaskSet mutex must be locked for writing.
 func (t *Task) promoteLocked() {
 	oldLeader := t.tg.leader
 	if t == oldLeader {
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 09366b60c..52c55d13d 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
 	}
 }
 
-// Preconditions: The caller must be running on the task goroutine, and leaving
-// a state indicated by a previous call to
-// t.accountTaskGoroutineEnter(state).
+// Preconditions:
+// * The caller must be running on the task goroutine
+// * The caller must be leaving a state indicated by a previous call to
+//   t.accountTaskGoroutineEnter(state).
 func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
 	if state != TaskGoroutineRunningApp {
 		// Task is unblocking/continuing.
@@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats {
 	return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
 }
 
-// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
-// must be locked.
+// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
+// * The TaskSet mutex must be locked.
 func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
 	stats := tg.exitedCPUStats
 	// Account for live tasks.
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index d6a2040bc..feaa38596 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -319,8 +319,9 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
 
 // Sigtimedwait implements the semantics of sigtimedwait(2).
 //
-// Preconditions: The caller must be running on the task goroutine. t.exitState
-// < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
 func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
 	// set is the set of signals we're interested in; invert it to get the set
 	// of signals to block.
@@ -584,8 +585,9 @@ func (t *Task) SignalMask() linux.SignalSet {
 
 // SetSignalMask sets t's signal mask.
 //
-// Preconditions: SetSignalMask can only be called by the task goroutine.
-// t.exitState < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
 func (t *Task) SetSignalMask(mask linux.SignalSet) {
 	// By precondition, t prevents t.tg from completing an execve and mutating
 	// t.tg.signalHandlers, so we can skip the TaskSet mutex.
@@ -631,7 +633,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
 // SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
 // comment).
 //
-// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
+// Preconditions: The caller must be running on the task goroutine.
 func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
 	t.savedSignalMask = mask
 	t.haveSavedSignalMask = true
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index 296735d32..a35948a5f 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -99,8 +99,9 @@ type TaskStop interface {
 
 // beginInternalStop indicates the start of an internal stop that applies to t.
 //
-// Preconditions: The task must not already be in an internal stop (i.e. t.stop
-// == nil). The caller must be running on the task goroutine.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The task must not already be in an internal stop (i.e. t.stop == nil).
 func (t *Task) beginInternalStop(s TaskStop) {
 	t.tg.pidns.owner.mu.RLock()
 	defer t.tg.pidns.owner.mu.RUnlock()
@@ -109,8 +110,8 @@ func (t *Task) beginInternalStop(s TaskStop) {
 	t.beginInternalStopLocked(s)
 }
 
-// Preconditions: The signal mutex must be locked. All preconditions for
-// Task.beginInternalStop also apply.
+// Preconditions: Same as beginInternalStop, plus:
+// * The signal mutex must be locked.
 func (t *Task) beginInternalStopLocked(s TaskStop) {
 	if t.stop != nil {
 		panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
@@ -128,8 +129,9 @@ func (t *Task) beginInternalStopLocked(s TaskStop) {
 // t.stop, which is why there is no endInternalStop that locks the signal mutex
 // for you.
 //
-// Preconditions: The signal mutex must be locked. The task must be in an
-// internal stop (i.e. t.stop != nil).
+// Preconditions:
+// * The signal mutex must be locked.
+// * The task must be in an internal stop (i.e. t.stop != nil).
 func (t *Task) endInternalStopLocked() {
 	if t.stop == nil {
 		panic("Attempting to leave non-existent internal stop")
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index b02044ad2..4550b9f89 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -143,8 +143,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
 // CopyOutIovecs converts src to an array of struct iovecs and copies it to the
 // memory mapped at addr.
 //
-// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyOut, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
 	switch t.Arch().Width() {
 	case 8:
@@ -191,8 +192,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
 // combined length of all AddrRanges would otherwise exceed this amount, ranges
 // beyond MAX_RW_COUNT are silently truncated.
 //
-// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyIn, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
 	if numIovecs == 0 {
 		return usermem.AddrRangeSeq{}, nil
@@ -284,7 +286,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
 //
 // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
 //
-// Preconditions: As for Task.CopyInIovecs.
+// Preconditions: Same as Task.CopyInIovecs.
 func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
 	if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
 		return usermem.IOSequence{}, syserror.EINVAL
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index e959700f2..f61a8e164 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) {
 // Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
 // starts the timer, while setting s.Enabled to false stops it.
 //
-// Preconditions: The Timer must not be paused. f cannot call any Timer methods
-// since it is called with the Timer mutex locked.
+// Preconditions:
+// * The Timer must not be paused.
+// * f cannot call any Timer methods since it is called with the Timer mutex
+//   locked.
 func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
 	now := t.clock.Now()
 	t.mu.Lock()
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index 290c32466..e44a139b3 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -73,13 +73,10 @@ type VDSOParamPage struct {
 // NewVDSOParamPage returns a VDSOParamPage.
 //
 // Preconditions:
-//
 // * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
 //   not take ownership of fr; it must remain allocated for the lifetime of the
 //   VDSOParamPage.
-//
 // * VDSOParamPage must be the only writer to fr.
-//
 // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
 func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
 	return &VDSOParamPage{mfp: mfp, fr: fr}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 20dd1cc21..d4610ec3b 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -402,8 +402,7 @@ type loadedELF struct {
 //
 // It does not load the ELF interpreter, or return any auxv entries.
 //
-// Preconditions:
-//  * f is an ELF file
+// Preconditions: f is an ELF file.
 func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
 	first := true
 	var start, end usermem.Addr
@@ -571,8 +570,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in
 // It does not load the ELF interpreter, or return any auxv entries.
 //
 // Preconditions:
-//  * f is an ELF file
-//  * f is the first ELF loaded into m
+// * f is an ELF file.
+// * f is the first ELF loaded into m.
 func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
@@ -609,8 +608,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS
 //
 // It does not return any auxv entries.
 //
-// Preconditions:
-//  * f is an ELF file
+// Preconditions: f is an ELF file.
 func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
@@ -640,8 +638,7 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.Fil
 // If loadELF returns ErrSwitchFile it should be called again with the returned
 // path and argv.
 //
-// Preconditions:
-//  * args.File is an ELF file
+// Preconditions: args.File is an ELF file.
 func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) {
 	bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File)
 	if err != nil {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 8d6802ea3..15c88aa7c 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -215,8 +215,8 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 // path and argv.
 //
 // Preconditions:
-//  * The Task MemoryManager is empty.
-//  * Load is called on the Task goroutine.
+// * The Task MemoryManager is empty.
+// * Load is called on the Task goroutine.
 func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the executable itself.
 	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go
index d609c1ae0..457ed87f8 100644
--- a/pkg/sentry/memmap/mapping_set.go
+++ b/pkg/sentry/memmap/mapping_set.go
@@ -177,7 +177,7 @@ func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr
 // AddMapping adds the given mapping and returns the set of MappableRanges that
 // previously had no mappings.
 //
-// Preconditions: As for Mappable.AddMapping.
+// Preconditions: Same as Mappable.AddMapping.
 func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange {
 	mr := MappableRange{offset, offset + uint64(ar.Length())}
 	var mapped []MappableRange
@@ -204,7 +204,7 @@ func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset ui
 // RemoveMapping removes the given mapping and returns the set of
 // MappableRanges that now have no mappings.
 //
-// Preconditions: As for Mappable.RemoveMapping.
+// Preconditions: Same as Mappable.RemoveMapping.
 func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange {
 	mr := MappableRange{offset, offset + uint64(ar.Length())}
 	var unmapped []MappableRange
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index 65d83096f..a44fa2b95 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -28,9 +28,9 @@ import (
 //
 // See mm/mm.go for Mappable's place in the lock order.
 //
-// Preconditions: For all Mappable methods, usermem.AddrRanges and
-// MappableRanges must be non-empty (Length() != 0), and usermem.Addrs and
-// Mappable offsets must be page-aligned.
+// All Mappable methods have the following preconditions:
+// * usermem.AddrRanges and MappableRanges must be non-empty (Length() != 0).
+// * usermem.Addrs and Mappable offsets must be page-aligned.
 type Mappable interface {
 	// AddMapping notifies the Mappable of a mapping from addresses ar in ms to
 	// offsets [offset, offset+ar.Length()) in this Mappable.
@@ -48,8 +48,10 @@ type Mappable interface {
 	// addresses ar in ms to offsets [offset, offset+ar.Length()) in this
 	// Mappable.
 	//
-	// Preconditions: offset+ar.Length() does not overflow. The removed mapping
-	// must exist. writable must match the corresponding call to AddMapping.
+	// Preconditions:
+	// * offset+ar.Length() does not overflow.
+	// * The removed mapping must exist. writable must match the
+	//   corresponding call to AddMapping.
 	RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool)
 
 	// CopyMapping notifies the Mappable of an attempt to copy a mapping in ms
@@ -60,9 +62,10 @@ type Mappable interface {
 	// CopyMapping is only called when a mapping is copied within a given
 	// MappingSpace; it is analogous to Linux's vm_operations_struct::mremap.
 	//
-	// Preconditions: offset+srcAR.Length() and offset+dstAR.Length() do not
-	// overflow. The mapping at srcAR must exist. writable must match the
-	// corresponding call to AddMapping.
+	// Preconditions:
+	// * offset+srcAR.Length() and offset+dstAR.Length() do not overflow.
+	// * The mapping at srcAR must exist. writable must match the
+	//   corresponding call to AddMapping.
 	CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error
 
 	// Translate returns the Mappable's current mappings for at least the range
@@ -77,11 +80,14 @@ type Mappable interface {
 	// reference is held on all pages in a File that may be the result
 	// of a valid Translation.
 	//
-	// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
-	// required and optional must be page-aligned. The caller must have
-	// established a mapping for all of the queried offsets via a previous call
-	// to AddMapping. The caller is responsible for ensuring that calls to
-	// Translate synchronize with invalidation.
+	// Preconditions:
+	// * required.Length() > 0.
+	// * optional.IsSupersetOf(required).
+	// * required and optional must be page-aligned.
+	// * The caller must have established a mapping for all of the queried
+	//   offsets via a previous call to AddMapping.
+	// * The caller is responsible for ensuring that calls to Translate
+	//   synchronize with invalidation.
 	//
 	// Postconditions: See CheckTranslateResult.
 	Translate(ctx context.Context, required, optional MappableRange, at usermem.AccessType) ([]Translation, error)
@@ -118,7 +124,7 @@ func (t Translation) FileRange() FileRange {
 // CheckTranslateResult returns an error if (ts, terr) does not satisfy all
 // postconditions for Mappable.Translate(required, optional, at).
 //
-// Preconditions: As for Mappable.Translate.
+// Preconditions: Same as Mappable.Translate.
 func CheckTranslateResult(required, optional MappableRange, at usermem.AccessType, ts []Translation, terr error) error {
 	// Verify that the inputs to Mappable.Translate were valid.
 	if !required.WellFormed() || required.Length() <= 0 {
@@ -214,7 +220,9 @@ type MappingSpace interface {
 	// Invalidate must not take any locks preceding mm.MemoryManager.activeMu
 	// in the lock order.
 	//
-	// Preconditions: ar.Length() != 0. ar must be page-aligned.
+	// Preconditions:
+	// * ar.Length() != 0.
+	// * ar must be page-aligned.
 	Invalidate(ar usermem.AddrRange, opts InvalidateOpts)
 }
 
@@ -375,16 +383,20 @@ type File interface {
 
 	// IncRef increments the reference count on all pages in fr.
 	//
-	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
-	// 0. At least one reference must be held on all pages in fr. (The File
-	// interface does not provide a way to acquire an initial reference;
-	// implementors may define mechanisms for doing so.)
+	// Preconditions:
+	// * fr.Start and fr.End must be page-aligned.
+	// * fr.Length() > 0.
+	// * At least one reference must be held on all pages in fr. (The File
+	//   interface does not provide a way to acquire an initial reference;
+	//   implementors may define mechanisms for doing so.)
 	IncRef(fr FileRange)
 
 	// DecRef decrements the reference count on all pages in fr.
 	//
-	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
-	// 0. At least one reference must be held on all pages in fr.
+	// Preconditions:
+	// * fr.Start and fr.End must be page-aligned.
+	// * fr.Length() > 0.
+	// * At least one reference must be held on all pages in fr.
 	DecRef(fr FileRange)
 
 	// MapInternal returns a mapping of the given file offsets in the invoking
@@ -392,8 +404,9 @@ type File interface {
 	//
 	// Note that fr.Start and fr.End need not be page-aligned.
 	//
-	// Preconditions: fr.Length() > 0. At least one reference must be held on
-	// all pages in fr.
+	// Preconditions:
+	// * fr.Length() > 0.
+	// * At least one reference must be held on all pages in fr.
 	//
 	// Postconditions: The returned mapping is valid as long as at least one
 	// reference is held on the mapped pages.
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 5c667117c..a93e76c75 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -166,8 +166,12 @@ func (mm *MemoryManager) Deactivate() {
 // mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings
 // for all addresses in ar should be precommitted.
 //
-// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0.
-// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start).
+// Preconditions:
+// * mm.activeMu must be locked.
+// * mm.as != nil.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
+// * pseg == mm.pmas.LowerBoundSegment(ar.Start).
 func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error {
 	// By default, map entire pmas at a time, under the assumption that there
 	// is no cost to mapping more of a pma than necessary.
diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go
index fa776f9c6..a8ac48080 100644
--- a/pkg/sentry/mm/io.go
+++ b/pkg/sentry/mm/io.go
@@ -441,7 +441,10 @@ func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts
 // handleASIOFault handles a page fault at address addr for an AddressSpaceIO
 // operation spanning ioar.
 //
-// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr).
+// Preconditions:
+// * mm.as != nil.
+// * ioar.Length() != 0.
+// * ioar.Contains(addr).
 func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error {
 	// Try to map all remaining pages in the I/O operation. This RoundUp can't
 	// overflow because otherwise it would have been caught by CheckIORange.
@@ -629,7 +632,9 @@ func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars userme
 // at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to
 // truncate usermem.AddrRangeSeq when errors occur.
 //
-// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End.
+// Preconditions:
+// * !arsit.IsEmpty().
+// * end <= arsit.Head().End.
 func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq {
 	ar := arsit.Head()
 	if end <= ar.Start {
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
index 930ec895f..30facebf7 100644
--- a/pkg/sentry/mm/pma.go
+++ b/pkg/sentry/mm/pma.go
@@ -31,7 +31,9 @@ import (
 // iterator to the pma containing ar.Start. Otherwise it returns a terminal
 // iterator.
 //
-// Preconditions: mm.activeMu must be locked. ar.Length() != 0.
+// Preconditions:
+// * mm.activeMu must be locked.
+// * ar.Length() != 0.
 func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 {
@@ -89,10 +91,13 @@ func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at user
 //
 // - An error that is non-nil if pmas exist for only a subset of ar.
 //
-// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
-// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist
-// for all addresses in ar, and support accesses of type at (i.e. permission
-// checks must have been performed against vmas).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
+// * ar.Length() != 0.
+// * vseg.Range().Contains(ar.Start).
+// * vmas must exist for all addresses in ar, and support accesses of type at
+//   (i.e. permission checks must have been performed against vmas).
 func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 {
@@ -135,9 +140,11 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar
 // exist. If this is not equal to ars, it returns a non-nil error explaining
 // why.
 //
-// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
-// writing. vmas must exist for all addresses in ars, and support accesses of
-// type at (i.e. permission checks must have been performed against vmas).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
+// * vmas must exist for all addresses in ars, and support accesses of type at
+//   (i.e. permission checks must have been performed against vmas).
 func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) {
 	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
 		ar := arsit.Head()
@@ -518,8 +525,10 @@ func privateAligned(ar usermem.AddrRange) usermem.AddrRange {
 // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
 // and update the pma to indicate that it does not require copy-on-write.
 //
-// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be
-// locked. mm.activeMu must be locked for writing.
+// Preconditions:
+// * vseg.Range().IsSupersetOf(pseg.Range()).
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
 func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
 	pma := pseg.ValuePtr()
 	if !pma.needCOW {
@@ -568,8 +577,10 @@ func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.Invalidate
 // invalidateLocked removes pmas and AddressSpace mappings of those pmas for
 // addresses in ar.
 //
-// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
-// must be page-aligned.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
@@ -613,7 +624,9 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat
 // most I/O. It should only be used in contexts that would use get_user_pages()
 // in the Linux kernel.
 //
-// Preconditions: ar.Length() != 0. ar must be page-aligned.
+// Preconditions:
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
@@ -693,9 +706,13 @@ func Unpin(prs []PinnedRange) {
 
 // movePMAsLocked moves all pmas in oldAR to newAR.
 //
-// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0.
-// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR).
-// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * oldAR.Length() != 0.
+// * oldAR.Length() <= newAR.Length().
+// * !oldAR.Overlaps(newAR).
+// * mm.pmas.IsEmptyRange(newAR).
+// * oldAR and newAR must be page-aligned.
 func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
 	if checkInvariants {
 		if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() {
@@ -751,9 +768,11 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
 // - An error that is non-nil if internal mappings exist for only a subset of
 // ar.
 //
-// Preconditions: mm.activeMu must be locked for writing.
-// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar.
-// ar.Length() != 0.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * pseg.Range().Contains(ar.Start).
+// * pmas must exist for all addresses in ar.
+// * ar.Length() != 0.
 //
 // Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
 // into mm.pmas.
@@ -783,8 +802,9 @@ func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar userm
 // internal mappings exist. If this is not equal to ars, it returns a non-nil
 // error explaining why.
 //
-// Preconditions: mm.activeMu must be locked for writing. pmas must exist for
-// all addresses in ar.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * pmas must exist for all addresses in ar.
 //
 // Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
 // into mm.pmas.
@@ -803,9 +823,12 @@ func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSe
 
 // internalMappingsLocked returns internal mappings for addresses in ar.
 //
-// Preconditions: mm.activeMu must be locked. Internal mappings must have been
-// previously established for all addresses in ar. ar.Length() != 0.
-// pseg.Range().Contains(ar.Start).
+// Preconditions:
+// * mm.activeMu must be locked.
+// * Internal mappings must have been previously established for all addresses
+//   in ar.
+// * ar.Length() != 0.
+// * pseg.Range().Contains(ar.Start).
 func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 {
@@ -839,8 +862,10 @@ func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.Add
 
 // vecInternalMappingsLocked returns internal mappings for addresses in ars.
 //
-// Preconditions: mm.activeMu must be locked. Internal mappings must have been
-// previously established for all addresses in ars.
+// Preconditions:
+// * mm.activeMu must be locked.
+// * Internal mappings must have been previously established for all addresses
+//   in ars.
 func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq {
 	var ims []safemem.Block
 	for ; !ars.IsEmpty(); ars = ars.Tail() {
@@ -969,7 +994,9 @@ func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (p
 // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
 // so by scanning linearly backward from pgap.
 //
-// Preconditions: mm.activeMu must be locked. addr <= pgap.Start().
+// Preconditions:
+// * mm.activeMu must be locked.
+// * addr <= pgap.Start().
 func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator {
 	if checkInvariants {
 		if !pgap.Ok() {
@@ -1015,7 +1042,9 @@ func (pseg pmaIterator) fileRange() memmap.FileRange {
 	return pseg.fileRangeOf(pseg.Range())
 }
 
-// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0.
+// Preconditions:
+// * pseg.Range().IsSupersetOf(ar).
+// * ar.Length != 0.
 func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange {
 	if checkInvariants {
 		if !pseg.Ok() {
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index e74d4e1c1..4c9a575e7 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -166,7 +166,9 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
 // populateVMA obtains pmas for addresses in ar in the given vma, and maps them
 // into mm.as if it is active.
 //
-// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * vseg.Range().IsSupersetOf(ar).
 func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
 	if !vseg.ValuePtr().effectivePerms.Any() {
 		// Linux doesn't populate inaccessible pages. See
@@ -208,8 +210,9 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u
 // preferable to populateVMA since it unlocks mm.mappingMu before performing
 // expensive operations that don't require it to be locked.
 //
-// Preconditions: mm.mappingMu must be locked for writing.
-// vseg.Range().IsSupersetOf(ar).
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * vseg.Range().IsSupersetOf(ar).
 //
 // Postconditions: mm.mappingMu will be unlocked.
 func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index c4e1989ed..f769d8294 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -27,8 +27,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// Preconditions: mm.mappingMu must be locked for writing. opts must be valid
-// as defined by the checks in MMap.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * opts must be valid as defined by the checks in MMap.
 func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) {
 	if opts.MaxPerms != opts.MaxPerms.Effective() {
 		panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms))
@@ -260,8 +261,9 @@ func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
 //
 // - An error that is non-nil if vmas exist for only a subset of ar.
 //
-// Preconditions: mm.mappingMu must be locked for reading; it may be
-// temporarily unlocked. ar.Length() != 0.
+// Preconditions:
+// * mm.mappingMu must be locked for reading; it may be temporarily unlocked.
+// * ar.Length() != 0.
 func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 {
@@ -342,8 +344,10 @@ const guardBytes = 256 * usermem.PageSize
 // unmapLocked unmaps all addresses in ar and returns the resulting gap in
 // mm.vmas.
 //
-// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
-// ar must be page-aligned.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
@@ -361,8 +365,10 @@ func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange)
 // gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
 // must do so before calling removeVMAsLocked.
 //
-// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar
-// must be page-aligned.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
 	if checkInvariants {
 		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
@@ -467,7 +473,9 @@ func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (v
 	return v, v2
 }
 
-// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr).
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.Range().Contains(addr).
 func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 {
 	if checkInvariants {
 		if !vseg.Ok() {
@@ -491,8 +499,10 @@ func (vseg vmaIterator) mappableRange() memmap.MappableRange {
 	return vseg.mappableRangeOf(vseg.Range())
 }
 
-// Preconditions: vseg.ValuePtr().mappable != nil.
-// vseg.Range().IsSupersetOf(ar). ar.Length() != 0.
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.Range().IsSupersetOf(ar).
+// * ar.Length() != 0.
 func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange {
 	if checkInvariants {
 		if !vseg.Ok() {
@@ -514,8 +524,10 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan
 	return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)}
 }
 
-// Preconditions: vseg.ValuePtr().mappable != nil.
-// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0.
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.mappableRange().IsSupersetOf(mr).
+// * mr.Length() != 0.
 func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
 	if checkInvariants {
 		if !vseg.Ok() {
@@ -540,7 +552,9 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
 // seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by
 // scanning linearly forward from vseg.
 //
-// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start().
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * addr >= vseg.Start().
 func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator {
 	if checkInvariants {
 		if !vseg.Ok() {
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 46d3be58c..626d1eaa4 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -507,7 +507,9 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6
 // nearest page. If this is shorter than length bytes due to an error returned
 // by r.ReadToBlocks(), it returns that error.
 //
-// Preconditions: length > 0. length must be page-aligned.
+// Preconditions:
+// * length > 0.
+// * length must be page-aligned.
 func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) {
 	fr, err := f.Allocate(length, kind)
 	if err != nil {
@@ -1167,8 +1169,10 @@ func (f *MemoryFile) startEvictionsLocked() bool {
 	return startedAny
 }
 
-// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
-// locked.
+// Preconditions:
+// * info == f.evictable[user].
+// * !info.evicting.
+// * f.mu must be locked.
 func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
 	info.evicting = true
 	f.evictionWG.Add(1)
diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go
index 57be41647..9dfac3eae 100644
--- a/pkg/sentry/platform/interrupt/interrupt.go
+++ b/pkg/sentry/platform/interrupt/interrupt.go
@@ -54,8 +54,9 @@ type Forwarder struct {
 // }
 // defer f.Disable()
 //
-// Preconditions: r must not be nil. f must not already be forwarding
-// interrupts to a Receiver.
+// Preconditions:
+// * r must not be nil.
+// * f must not already be forwarding interrupts to a Receiver.
 func (f *Forwarder) Enable(r Receiver) bool {
 	if r == nil {
 		panic("nil Receiver")
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index ba031516a..530e779b0 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -245,14 +245,19 @@ type AddressSpace interface {
 	// physical memory) to the mapping. The precommit flag is advisory and
 	// implementations may choose to ignore it.
 	//
-	// Preconditions: addr and fr must be page-aligned. fr.Length() > 0.
-	// at.Any() == true. At least one reference must be held on all pages in
-	// fr, and must continue to be held as long as pages are mapped.
+	// Preconditions:
+	// * addr and fr must be page-aligned.
+	// * fr.Length() > 0.
+	// * at.Any() == true.
+	// * At least one reference must be held on all pages in fr, and must
+	//   continue to be held as long as pages are mapped.
 	MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error
 
 	// Unmap unmaps the given range.
 	//
-	// Preconditions: addr is page-aligned. length > 0.
+	// Preconditions:
+	// * addr is page-aligned.
+	// * length > 0.
 	Unmap(addr usermem.Addr, length uint64)
 
 	// Release releases this address space. After releasing, a new AddressSpace
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index bc7ea93ea..a69a5b2f1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -242,8 +242,9 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) {
 // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
 // CommitRenameExchangeDentry depending on the rename's outcome.
 //
-// Preconditions: If to is not nil, it must be a child Dentry from the same
-// Filesystem. from != to.
+// Preconditions:
+// * If to is not nil, it must be a child Dentry from the same Filesystem.
+// * from != to.
 func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
 	vfs.mountMu.Lock()
 	if mntns.mountpoints[from] != 0 {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index dcafffe57..d3abe28ee 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -371,8 +371,9 @@ type FileDescriptionImpl interface {
 	//
 	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
 	//
-	// Preconditions: The FileDescription was opened for reading.
-	// FileDescriptionOptions.DenyPRead == false.
+	// Preconditions:
+	// * The FileDescription was opened for reading.
+	// * FileDescriptionOptions.DenyPRead == false.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -403,8 +404,9 @@ type FileDescriptionImpl interface {
 	// - If opts.Flags specifies unsupported options, PWrite returns
 	// EOPNOTSUPP.
 	//
-	// Preconditions: The FileDescription was opened for writing.
-	// FileDescriptionOptions.DenyPWrite == false.
+	// Preconditions:
+	// * The FileDescription was opened for writing.
+	// * FileDescriptionOptions.DenyPWrite == false.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index df3758fd1..2c60cfab2 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -212,8 +212,9 @@ type FilesystemImpl interface {
 	// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
 	// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If LinkAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -231,8 +232,9 @@ type FilesystemImpl interface {
 	// - If the directory in which the new directory would be created has been
 	// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If MkdirAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -253,8 +255,9 @@ type FilesystemImpl interface {
 	// - If the directory in which the file would be created has been removed
 	// by RmdirAt or RenameAt, MknodAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If MknodAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -345,11 +348,12 @@ type FilesystemImpl interface {
 	// - If renaming would replace a non-empty directory, RenameAt returns
 	// ENOTEMPTY.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
-	// previous call to
-	// oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
-	// not "." or "..".
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
+	// * oldParentVD.Dentry() was obtained from a previous call to
+	//   oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt().
+	// * oldName is not "." or "..".
 	//
 	// Postconditions: If RenameAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -372,8 +376,9 @@ type FilesystemImpl interface {
 	// - If the file at rp exists but is not a directory, RmdirAt returns
 	// ENOTDIR.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If RmdirAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -410,8 +415,9 @@ type FilesystemImpl interface {
 	// - If the directory in which the symbolic link would be created has been
 	// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If SymlinkAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -431,8 +437,9 @@ type FilesystemImpl interface {
 	//
 	// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If UnlinkAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 67dfba986..714af6907 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -369,8 +369,9 @@ type umountRecursiveOptions struct {
 //
 // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
 //
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
 func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
 	if !mnt.umounted {
 		mnt.umounted = true
@@ -399,9 +400,11 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
 // connectLocked makes vd the mount parent/point for mnt. It consumes
 // references held by vd.
 //
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
-// must not already be connected.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
+// * d.mu must be locked.
+// * mnt.parent() == nil, i.e. mnt must not already be connected.
 func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
 	if checkInvariants {
 		if mnt.parent() != nil {
@@ -429,8 +432,10 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
 // disconnectLocked makes vd have no mount parent/point and returns its old
 // mount parent/point with a reference held.
 //
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. mnt.parent() != nil.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
+// * mnt.parent() != nil.
 func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
 	vd := mnt.loadKey()
 	if checkInvariants {
@@ -576,8 +581,9 @@ retryFirst:
 // mnt. It takes a reference on the returned VirtualDentry. If no such mount
 // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
 //
-// Preconditions: References are held on mnt and root. vfsroot is not (mnt,
-// mnt.root).
+// Preconditions:
+// * References are held on mnt and root.
+// * vfsroot is not (mnt, mnt.root).
 func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
 	// The first mount is special-cased:
 	//
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 70f850ca4..777d631cb 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -217,8 +217,9 @@ func (mt *mountTable) Insert(mount *Mount) {
 
 // insertSeqed inserts the given mount into mt.
 //
-// Preconditions: mt.seq must be in a writer critical section. mt must not
-// already contain a Mount with the same mount point and parent.
+// Preconditions:
+// * mt.seq must be in a writer critical section.
+// * mt must not already contain a Mount with the same mount point and parent.
 func (mt *mountTable) insertSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 
@@ -269,9 +270,11 @@ func (mt *mountTable) insertSeqed(mount *Mount) {
 	atomic.StorePointer(&mt.slots, newSlots)
 }
 
-// Preconditions: There are no concurrent mutators of the table (slots, cap).
-// If the table is visible to readers, then mt.seq must be in a writer critical
-// section. cap must be a power of 2.
+// Preconditions:
+// * There are no concurrent mutators of the table (slots, cap).
+// * If the table is visible to readers, then mt.seq must be in a writer
+//   critical section.
+// * cap must be a power of 2.
 func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
 	mask := cap - 1
 	off := (hash & mask) * mountSlotBytes
@@ -313,8 +316,9 @@ func (mt *mountTable) Remove(mount *Mount) {
 
 // removeSeqed removes the given mount from mt.
 //
-// Preconditions: mt.seq must be in a writer critical section. mt must contain
-// mount.
+// Preconditions:
+// * mt.seq must be in a writer critical section.
+// * mt must contain mount.
 func (mt *mountTable) removeSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
diff --git a/pkg/syncevent/broadcaster.go b/pkg/syncevent/broadcaster.go
index 4bff59e7d..dabf08895 100644
--- a/pkg/syncevent/broadcaster.go
+++ b/pkg/syncevent/broadcaster.go
@@ -111,7 +111,9 @@ func (b *Broadcaster) SubscribeEvents(r *Receiver, filter Set) SubscriptionID {
 	return id
 }
 
-// Preconditions: table must not be full. len(table) is a power of 2.
+// Preconditions:
+// * table must not be full.
+// * len(table) is a power of 2.
 func broadcasterTableInsert(table []broadcasterSlot, id SubscriptionID, r *Receiver, filter Set) {
 	entry := broadcasterSlot{
 		receiver: r,
diff --git a/pkg/syncevent/source.go b/pkg/syncevent/source.go
index ddffb171a..d3d0f34c5 100644
--- a/pkg/syncevent/source.go
+++ b/pkg/syncevent/source.go
@@ -19,9 +19,11 @@ type Source interface {
 	// SubscribeEvents causes the Source to notify the given Receiver of the
 	// given subset of events.
 	//
-	// Preconditions: r != nil. The ReceiverCallback for r must not take locks
-	// that are ordered prior to the Source; for example, it cannot call any
-	// Source methods.
+	// Preconditions:
+	// * r != nil.
+	// * The ReceiverCallback for r must not take locks that are ordered
+	//   prior to the Source; for example, it cannot call any Source
+	//   methods.
 	SubscribeEvents(r *Receiver, filter Set) SubscriptionID
 
 	// UnsubscribeEvents causes the Source to stop notifying the Receiver
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
index 7dd344b4f..836682ea0 100644
--- a/pkg/tcpip/stack/conntrack.go
+++ b/pkg/tcpip/stack/conntrack.go
@@ -572,7 +572,9 @@ func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, tim
 // reapTupleLocked tries to remove tuple and its reply from the table. It
 // returns whether the tuple's connection has timed out.
 //
-// Preconditions: ct.mu is locked for reading and bucket is locked.
+// Preconditions:
+// * ct.mu is locked for reading.
+// * bucket is locked.
 func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool {
 	if !tuple.conn.timedOut(now) {
 		return false
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index c37da814f..41ef4236b 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -315,8 +315,8 @@ func (it *IPTables) startReaper(interval time.Duration) {
 // should not go forward.
 //
 // Preconditions:
-// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// - pkt.NetworkHeader is not nil.
+// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// * pkt.NetworkHeader is not nil.
 //
 // NOTE: unlike the Check API the returned map contains packets that should be
 // dropped.
@@ -341,8 +341,8 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *
 }
 
 // Preconditions:
-// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// - pkt.NetworkHeader is not nil.
+// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// * pkt.NetworkHeader is not nil.
 func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
@@ -388,8 +388,8 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId
 }
 
 // Preconditions:
-// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// - pkt.NetworkHeader is not nil.
+// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// * pkt.NetworkHeader is not nil.
 func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go
index d843f19cf..c976d7230 100644
--- a/pkg/unet/unet.go
+++ b/pkg/unet/unet.go
@@ -522,7 +522,7 @@ func (s *ServerSocket) Listen() error {
 // This is always blocking.
 //
 // Preconditions:
-//  * ServerSocket is listening (Listen called).
+// * ServerSocket is listening (Listen called).
 func (s *ServerSocket) Accept() (*Socket, error) {
 	fd, ok := s.socket.enterFD()
 	if !ok {
diff --git a/pkg/usermem/addr_range_seq_unsafe.go b/pkg/usermem/addr_range_seq_unsafe.go
index c09337c15..495896ded 100644
--- a/pkg/usermem/addr_range_seq_unsafe.go
+++ b/pkg/usermem/addr_range_seq_unsafe.go
@@ -81,8 +81,10 @@ func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq {
 	return addrRangeSeqFromSliceLimited(slice, limit)
 }
 
-// Preconditions: The combined length of all AddrRanges in slice <= limit.
-// limit >= 0. If len(slice) != 0, then limit > 0.
+// Preconditions:
+// * The combined length of all AddrRanges in slice <= limit.
+// * limit >= 0.
+// * If len(slice) != 0, then limit > 0.
 func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq {
 	switch len(slice) {
 	case 0:
diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go
index cd6a0ea6b..27279b409 100644
--- a/pkg/usermem/usermem.go
+++ b/pkg/usermem/usermem.go
@@ -54,8 +54,10 @@ type IO interface {
 	// of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a
 	// non-nil error explaining why.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. toZero >= 0.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * toZero >= 0.
 	ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error)
 
 	// CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at
@@ -66,9 +68,11 @@ type IO interface {
 	//
 	// CopyOutFrom calls src.ReadToBlocks at most once.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. src.ReadToBlocks must not block
-	// on mm.MemoryManager.activeMu or any preceding locks in the lock order.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * src.ReadToBlocks must not block on mm.MemoryManager.activeMu or
+	//   any preceding locks in the lock order.
 	CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error)
 
 	// CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to
@@ -78,10 +82,11 @@ type IO interface {
 	//
 	// CopyInTo calls dst.WriteFromBlocks at most once.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. dst.WriteFromBlocks must not
-	// block on mm.MemoryManager.activeMu or any preceding locks in the lock
-	// order.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * dst.WriteFromBlocks must not block on mm.MemoryManager.activeMu or
+	//   any preceding locks in the lock order.
 	CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error)
 
 	// TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst
@@ -93,25 +98,28 @@ type IO interface {
 	// SwapUint32 atomically sets the uint32 value at addr to new and
 	// returns the previous value.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * addr must be aligned to a 4-byte boundary.
 	SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error)
 
 	// CompareAndSwapUint32 atomically compares the uint32 value at addr to
 	// old; if they are equal, the value in memory is replaced by new. In
 	// either case, the previous value stored in memory is returned.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * addr must be aligned to a 4-byte boundary.
 	CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error)
 
 	// LoadUint32 atomically loads the uint32 value at addr and returns it.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * addr must be aligned to a 4-byte boundary.
 	LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error)
 }
 
@@ -183,7 +191,7 @@ func (rw *IOReadWriter) Write(src []byte) (int, error) {
 // CopyObjectOut must use reflection to encode src; performance-sensitive
 // clients should do encoding manually and use uio.CopyOut directly.
 //
-// Preconditions: As for IO.CopyOut.
+// Preconditions: Same as IO.CopyOut.
 func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) {
 	w := &IOReadWriter{
 		Ctx:  ctx,
@@ -205,7 +213,7 @@ func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts
 // CopyObjectIn must use reflection to decode dst; performance-sensitive
 // clients should use uio.CopyIn directly and do decoding manually.
 //
-// Preconditions: As for IO.CopyIn.
+// Preconditions: Same as IO.CopyIn.
 func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) {
 	r := &IOReadWriter{
 		Ctx:  ctx,
@@ -233,7 +241,8 @@ const (
 // would exceed maxlen, CopyStringIn returns the string truncated to maxlen and
 // ENAMETOOLONG.
 //
-// Preconditions: As for IO.CopyFromUser. maxlen >= 0.
+// Preconditions: Same as IO.CopyFromUser, plus:
+// * maxlen >= 0.
 func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) {
 	initLen := maxlen
 	if initLen > copyStringMaxInitBufLen {
@@ -287,7 +296,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 // less. CopyOutVec returns the number of bytes copied; if this is less than
 // the maximum, it returns a non-nil error explaining why.
 //
-// Preconditions: As for IO.CopyOut.
+// Preconditions: Same as IO.CopyOut.
 func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) {
 	var done int
 	for !ars.IsEmpty() && done < len(src) {
@@ -311,7 +320,7 @@ func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts
 // less. CopyInVec returns the number of bytes copied; if this is less than the
 // maximum, it returns a non-nil error explaining why.
 //
-// Preconditions: As for IO.CopyIn.
+// Preconditions: Same as IO.CopyIn.
 func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) {
 	var done int
 	for !ars.IsEmpty() && done < len(dst) {
@@ -335,7 +344,7 @@ func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts I
 // ZeroOutVec returns the number of bytes written; if this is less than the
 // maximum, it returns a non-nil error explaining why.
 //
-// Preconditions: As for IO.ZeroOut.
+// Preconditions: Same as IO.ZeroOut.
 func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) {
 	var done int64
 	for !ars.IsEmpty() && done < toZero {
@@ -388,7 +397,7 @@ func isASCIIWhitespace(b byte) bool {
 //
 // - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0.
 //
-// Preconditions: As for CopyInVec.
+// Preconditions: Same as CopyInVec.
 func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) {
 	if len(dsts) == 0 {
 		return 0, nil
@@ -481,28 +490,28 @@ func (s IOSequence) NumBytes() int64 {
 
 // DropFirst returns a copy of s with s.Addrs.DropFirst(n).
 //
-// Preconditions: As for AddrRangeSeq.DropFirst.
+// Preconditions: Same as AddrRangeSeq.DropFirst.
 func (s IOSequence) DropFirst(n int) IOSequence {
 	return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts}
 }
 
 // DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n).
 //
-// Preconditions: As for AddrRangeSeq.DropFirst64.
+// Preconditions: Same as AddrRangeSeq.DropFirst64.
 func (s IOSequence) DropFirst64(n int64) IOSequence {
 	return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts}
 }
 
 // TakeFirst returns a copy of s with s.Addrs.TakeFirst(n).
 //
-// Preconditions: As for AddrRangeSeq.TakeFirst.
+// Preconditions: Same as AddrRangeSeq.TakeFirst.
 func (s IOSequence) TakeFirst(n int) IOSequence {
 	return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts}
 }
 
 // TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n).
 //
-// Preconditions: As for AddrRangeSeq.TakeFirst64.
+// Preconditions: Same as AddrRangeSeq.TakeFirst64.
 func (s IOSequence) TakeFirst64(n int64) IOSequence {
 	return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts}
 }
@@ -512,7 +521,7 @@ func (s IOSequence) TakeFirst64(n int64) IOSequence {
 // As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated
 // to s.NumBytes(), and a nil error will be returned.
 //
-// Preconditions: As for CopyOutVec.
+// Preconditions: Same as CopyOutVec.
 func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) {
 	return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts)
 }
@@ -522,7 +531,7 @@ func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) {
 // As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to
 // s.NumBytes(), and a nil error will be returned.
 //
-// Preconditions: As for CopyInVec.
+// Preconditions: Same as CopyInVec.
 func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) {
 	return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts)
 }
@@ -532,21 +541,21 @@ func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) {
 // As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated
 // to s.NumBytes(), and a nil error will be returned.
 //
-// Preconditions: As for ZeroOutVec.
+// Preconditions: Same as ZeroOutVec.
 func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) {
 	return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts)
 }
 
 // CopyOutFrom invokes s.CopyOutFrom over s.Addrs.
 //
-// Preconditions: As for IO.CopyOutFrom.
+// Preconditions: Same as IO.CopyOutFrom.
 func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) {
 	return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts)
 }
 
 // CopyInTo invokes s.CopyInTo over s.Addrs.
 //
-// Preconditions: As for IO.CopyInTo.
+// Preconditions: Same as IO.CopyInTo.
 func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) {
 	return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts)
 }
-- 
cgit v1.2.3


From 76e5213c113a0eefa7341122dc06e4a4191800e8 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 20 Aug 2020 14:10:46 -0700
Subject: Add reference count checking to the fsimpl/host package.

Includes a minor refactor for inode construction.

Updates #1486.

PiperOrigin-RevId: 327694933
---
 pkg/sentry/fsimpl/host/BUILD     | 25 +++++++++++
 pkg/sentry/fsimpl/host/host.go   | 94 +++++++++++++++++++++++-----------------
 pkg/sentry/fsimpl/host/socket.go | 20 ++++-----
 3 files changed, 88 insertions(+), 51 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index bd701bbc7..090ae0804 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -1,12 +1,37 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "inode_refs",
+    out = "inode_refs.go",
+    package = "host",
+    prefix = "inode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "inode",
+    },
+)
+
+go_template_instance(
+    name = "connected_endpoint_refs",
+    out = "connected_endpoint_refs.go",
+    package = "host",
+    prefix = "ConnectedEndpoint",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "ConnectedEndpoint",
+    },
+)
+
 go_library(
     name = "host",
     srcs = [
+        "connected_endpoint_refs.go",
         "control.go",
         "host.go",
+        "inode_refs.go",
         "ioctl_unsafe.go",
         "mmap.go",
         "socket.go",
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 56869f59a..2d3821f33 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -27,7 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostfd"
@@ -41,6 +40,44 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+func newInode(fs *filesystem, hostFD int, fileType linux.FileMode, isTTY bool) (*inode, error) {
+	// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
+	// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
+	// devices.
+	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+	seekable := err != syserror.ESPIPE
+
+	i := &inode{
+		hostFD:     hostFD,
+		ino:        fs.NextIno(),
+		isTTY:      isTTY,
+		wouldBlock: wouldBlock(uint32(fileType)),
+		seekable:   seekable,
+		// NOTE(b/38213152): Technically, some obscure char devices can be memory
+		// mapped, but we only allow regular files.
+		canMap: fileType == linux.S_IFREG,
+	}
+	i.pf.inode = i
+	i.refs.EnableLeakCheck()
+
+	// Non-seekable files can't be memory mapped, assert this.
+	if !i.seekable && i.canMap {
+		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+	}
+
+	// If the hostFD would block, we must set it to non-blocking and handle
+	// blocking behavior in the sentry.
+	if i.wouldBlock {
+		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+			return nil, err
+		}
+		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+			return nil, err
+		}
+	}
+	return i, nil
+}
+
 // NewFDOptions contains options to NewFD.
 type NewFDOptions struct {
 	// If IsTTY is true, the file descriptor is a TTY.
@@ -76,44 +113,11 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 		flags = uint32(flagsInt)
 	}
 
-	fileMode := linux.FileMode(s.Mode)
-	fileType := fileMode.FileType()
-
-	// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
-	// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
-	// devices.
-	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
-	seekable := err != syserror.ESPIPE
-
-	i := &inode{
-		hostFD:     hostFD,
-		ino:        fs.NextIno(),
-		isTTY:      opts.IsTTY,
-		wouldBlock: wouldBlock(uint32(fileType)),
-		seekable:   seekable,
-		// NOTE(b/38213152): Technically, some obscure char devices can be memory
-		// mapped, but we only allow regular files.
-		canMap: fileType == linux.S_IFREG,
-	}
-	i.pf.inode = i
-
-	// Non-seekable files can't be memory mapped, assert this.
-	if !i.seekable && i.canMap {
-		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
-	}
-
-	// If the hostFD would block, we must set it to non-blocking and handle
-	// blocking behavior in the sentry.
-	if i.wouldBlock {
-		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
-			return nil, err
-		}
-		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
-			return nil, err
-		}
-	}
-
 	d := &kernfs.Dentry{}
+	i, err := newInode(fs, hostFD, linux.FileMode(s.Mode).FileType(), opts.IsTTY)
+	if err != nil {
+		return nil, err
+	}
 	d.Init(i)
 
 	// i.open will take a reference on d.
@@ -188,7 +192,7 @@ type inode struct {
 	locks vfs.FileLocks
 
 	// When the reference count reaches zero, the host fd is closed.
-	refs.AtomicRefCount
+	refs inodeRefs
 
 	// hostFD contains the host fd that this file was originally created from,
 	// which must be available at time of restore.
@@ -430,9 +434,19 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	return nil
 }
 
+// IncRef implements kernfs.Inode.
+func (i *inode) IncRef() {
+	i.refs.IncRef()
+}
+
+// TryIncRef implements kernfs.Inode.
+func (i *inode) TryIncRef() bool {
+	return i.refs.TryIncRef()
+}
+
 // DecRef implements kernfs.Inode.
 func (i *inode) DecRef(ctx context.Context) {
-	i.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+	i.refs.DecRef(func() {
 		if i.wouldBlock {
 			fdnotifier.RemoveFD(int32(i.hostFD))
 		}
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 4979dd0a9..131145b85 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
@@ -59,8 +58,7 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
 //
 // +stateify savable
 type ConnectedEndpoint struct {
-	// ref keeps track of references to a ConnectedEndpoint.
-	ref refs.AtomicRefCount
+	ConnectedEndpointRefs
 
 	// mu protects fd below.
 	mu sync.RWMutex `state:"nosave"`
@@ -132,9 +130,9 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
 		return nil, err
 	}
 
-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()
 	return &e, nil
 }
 
@@ -318,7 +316,7 @@ func (c *ConnectedEndpoint) destroyLocked() {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (c *ConnectedEndpoint) Release(ctx context.Context) {
-	c.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	c.DecRef(func() {
 		c.mu.Lock()
 		c.destroyLocked()
 		c.mu.Unlock()
@@ -348,7 +346,7 @@ func (e *SCMConnectedEndpoint) Init() error {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
-	e.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	e.DecRef(func() {
 		e.mu.Lock()
 		if err := syscall.Close(e.fd); err != nil {
 			log.Warningf("Failed to close host fd %d: %v", err)
@@ -378,8 +376,8 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s
 		return nil, err
 	}
 
-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()
 	return &e, nil
 }
-- 
cgit v1.2.3


From b17c7094f706ce92eba44e72e2cede8814b29607 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 20 Aug 2020 15:38:06 -0700
Subject: stateify: Fix afterLoad not being called for root object

PiperOrigin-RevId: 327711264
---
 pkg/state/decode.go          | 6 ++++--
 pkg/state/tests/load_test.go | 8 ++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/state/decode.go b/pkg/state/decode.go
index c9971cdf6..89467ca8e 100644
--- a/pkg/state/decode.go
+++ b/pkg/state/decode.go
@@ -584,10 +584,12 @@ func (ds *decodeState) Load(obj reflect.Value) {
 	})
 
 	// Create the root object.
-	ds.objectsByID = append(ds.objectsByID, &objectDecodeState{
+	rootOds := &objectDecodeState{
 		id:  1,
 		obj: obj,
-	})
+	}
+	ds.objectsByID = append(ds.objectsByID, rootOds)
+	ds.pending.PushBack(rootOds)
 
 	// Read the number of objects.
 	lastID, object, err := ReadHeader(ds.r)
diff --git a/pkg/state/tests/load_test.go b/pkg/state/tests/load_test.go
index 1e9794296..3c73ac391 100644
--- a/pkg/state/tests/load_test.go
+++ b/pkg/state/tests/load_test.go
@@ -20,6 +20,14 @@ import (
 
 func TestLoadHooks(t *testing.T) {
 	runTestCases(t, false, "load-hooks", []interface{}{
+		// Root object being a struct.
+		afterLoadStruct{v: 1},
+		valueLoadStruct{v: 1},
+		genericContainer{v: &afterLoadStruct{v: 1}},
+		genericContainer{v: &valueLoadStruct{v: 1}},
+		sliceContainer{v: []interface{}{&afterLoadStruct{v: 1}}},
+		sliceContainer{v: []interface{}{&valueLoadStruct{v: 1}}},
+		// Root object being a pointer.
 		&afterLoadStruct{v: 1},
 		&valueLoadStruct{v: 1},
 		&genericContainer{v: &afterLoadStruct{v: 1}},
-- 
cgit v1.2.3


From fc68f90fc066473951521b6bdd3adfd6c7dfc61f Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Thu, 20 Aug 2020 16:25:57 -0700
Subject: [vfs] Create recursive dir creation util.

Refactored the recursive dir creation util in runsc/boot/vfs.go to be more
flexible.

PiperOrigin-RevId: 327719100
---
 pkg/sentry/vfs/vfs.go | 33 +++++++++++++++++++++++++++++++++
 runsc/boot/vfs.go     | 32 ++------------------------------
 2 files changed, 35 insertions(+), 30 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8a79e1325..ec27562d6 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -36,6 +36,7 @@ package vfs
 
 import (
 	"fmt"
+	"path"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -782,6 +783,38 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
 	return retErr
 }
 
+// MkdirAllAt recursively creates non-existent directories on the given path
+// (including the last component).
+func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error {
+	pop := &PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+	stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE})
+	switch err {
+	case nil:
+		if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory {
+			return syserror.ENOTDIR
+		}
+		// Directory already exists.
+		return nil
+	case syserror.ENOENT:
+		// Expected, we will create the dir.
+	default:
+		return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err)
+	}
+
+	// Recurse to ensure parent is created and then create the final directory.
+	if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil {
+		return err
+	}
+	if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create directory %q: %w", currentPath, err)
+	}
+	return nil
+}
+
 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry
 // (which represents a node in a Filesystem's tree) and a Mount (which
 // represents the Filesystem's position in a VFS mount tree).
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 3da7a64f0..f27a6ff6b 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,7 +16,6 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"sort"
 	"strings"
 
@@ -274,7 +273,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
 		return nil
 	}
 
-	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
+	if err := c.k.VFS().MkdirAllAt(ctx, submount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil {
 		return err
 	}
 	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
@@ -348,33 +347,6 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
 	return fsName, opts, nil
 }
 
-func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(currentPath),
-	}
-	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
-	if err == nil {
-		log.Debugf("Mount point %q already exists", currentPath)
-		return nil
-	}
-	if err != syserror.ENOENT {
-		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
-	}
-
-	// Recurse to ensure parent is created and then create the mount point.
-	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
-		return err
-	}
-	log.Debugf("Creating dir %q for mount point", currentPath)
-	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
-	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
-		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
-	}
-	return nil
-}
-
 // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
 // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
 // the host /tmp, but this is a nice optimization, and fixes some apps that call
@@ -503,7 +475,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
 
 	root := mns.Root()
 	defer root.DecRef(ctx)
-	if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
+	if err := c.k.VFS().MkdirAllAt(ctx, mount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil {
 		return err
 	}
 
-- 
cgit v1.2.3


From 124b95efc2799b09a8a5b47f0bfa387eaace8cf5 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 21 Aug 2020 12:08:25 -0700
Subject: Fix parent directory creation in CreateDeviceFile.

It was not properly creating recursive directories. Added tests for this case.

Updates #1196

PiperOrigin-RevId: 327850811
---
 pkg/sentry/fsimpl/devtmpfs/BUILD            |   1 +
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go      |  13 +-
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 189 ++++++++++++++++++++++------
 3 files changed, 155 insertions(+), 48 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD
index aa0c2ad8c..01bbee5ad 100644
--- a/pkg/sentry/fsimpl/devtmpfs/BUILD
+++ b/pkg/sentry/fsimpl/devtmpfs/BUILD
@@ -24,6 +24,7 @@ go_test(
     library = ":devtmpfs",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/tmpfs",
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 2ed5fa8a9..52f44f66d 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -18,6 +18,7 @@ package devtmpfs
 
 import (
 	"fmt"
+	"path"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -150,13 +151,11 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
 
 	// Create any parent directories. See
 	// devtmpfs.c:handle_create()=>path_create().
-	for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
-		pop := a.pathOperationAt(it.String())
-		if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
-			Mode: 0755,
-		}); err != nil {
-			return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
-		}
+	parent := path.Dir(pathname)
+	if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{
+		Mode: 0755,
+	}); err != nil {
+		return fmt.Errorf("failed to create device parent directory %q: %v", parent, err)
 	}
 
 	// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 747867cca..6b56c5e71 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -15,9 +15,11 @@
 package devtmpfs
 
 import (
+	"path"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
@@ -25,10 +27,13 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
-func TestDevtmpfs(t *testing.T) {
+const devPath = "/dev"
+
+func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry, func()) {
+	t.Helper()
+
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
-
 	vfsObj := &vfs.VirtualFilesystem{}
 	if err := vfsObj.Init(ctx); err != nil {
 		t.Fatalf("VFS init: %v", err)
@@ -43,14 +48,11 @@ func TestDevtmpfs(t *testing.T) {
 	})
 
 	// Create a test mount namespace with devtmpfs mounted at "/dev".
-	const devPath = "/dev"
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
-	defer mntns.DecRef(ctx)
 	root := mntns.Root()
-	defer root.DecRef(ctx)
 	devpop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
@@ -65,6 +67,16 @@ func TestDevtmpfs(t *testing.T) {
 		t.Fatalf("failed to mount devtmpfs: %v", err)
 	}
 
+	return ctx, creds, vfsObj, root, func() {
+		root.DecRef(ctx)
+		mntns.DecRef(ctx)
+	}
+}
+
+func TestUserspaceInit(t *testing.T) {
+	ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+	defer cleanup()
+
 	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
 	if err != nil {
 		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
@@ -75,48 +87,143 @@ func TestDevtmpfs(t *testing.T) {
 	if err := a.UserspaceInit(ctx); err != nil {
 		t.Fatalf("failed to userspace-initialize devtmpfs: %v", err)
 	}
+
 	// Created files should be visible in the test mount namespace.
-	abspath := devPath + "/fd"
-	target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(abspath),
-	})
-	if want := "/proc/self/fd"; err != nil || target != want {
-		t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want)
+	links := []struct {
+		source string
+		target string
+	}{
+		{
+			source: "fd",
+			target: "/proc/self/fd",
+		},
+		{
+			source: "stdin",
+			target: "/proc/self/fd/0",
+		},
+		{
+			source: "stdout",
+			target: "/proc/self/fd/1",
+		},
+		{
+			source: "stderr",
+			target: "/proc/self/fd/2",
+		},
+		{
+			source: "ptmx",
+			target: "pts/ptmx",
+		},
 	}
 
-	// Create a dummy device special file using a devtmpfs.Accessor.
-	const (
-		pathInDev = "dummy"
-		kind      = vfs.CharDevice
-		major     = 12
-		minor     = 34
-		perms     = 0600
-		wantMode  = linux.S_IFCHR | perms
-	)
-	if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil {
-		t.Fatalf("failed to create device file: %v", err)
+	for _, link := range links {
+		abspath := path.Join(devPath, link.source)
+		if gotTarget, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}); err != nil || gotTarget != link.target {
+			t.Errorf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, gotTarget, err, link.target)
+		}
 	}
-	// The device special file should be visible in the test mount namespace.
-	abspath = devPath + "/" + pathInDev
-	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(abspath),
-	}, &vfs.StatOptions{
-		Mask: linux.STATX_TYPE | linux.STATX_MODE,
-	})
-	if err != nil {
-		t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+
+	dirs := []string{"shm", "pts"}
+	for _, dir := range dirs {
+		abspath := path.Join(devPath, dir)
+		statx, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_MODE,
+		})
+		if err != nil {
+			t.Errorf("stat(%q): got error %v ", abspath, err)
+			continue
+		}
+		if want := uint16(0755) | linux.S_IFDIR; statx.Mode != want {
+			t.Errorf("stat(%q): got mode %x, want %x", abspath, statx.Mode, want)
+		}
 	}
-	if stat.Mode != wantMode {
-		t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+}
+
+func TestCreateDeviceFile(t *testing.T) {
+	ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+	defer cleanup()
+
+	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
+	if err != nil {
+		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
 	}
-	if stat.RdevMajor != major {
-		t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major)
+	defer a.Release(ctx)
+
+	devFiles := []struct {
+		path  string
+		kind  vfs.DeviceKind
+		major uint32
+		minor uint32
+		perms uint16
+	}{
+		{
+			path:  "dummy",
+			kind:  vfs.CharDevice,
+			major: 12,
+			minor: 34,
+			perms: 0600,
+		},
+		{
+			path:  "foo/bar",
+			kind:  vfs.BlockDevice,
+			major: 13,
+			minor: 35,
+			perms: 0660,
+		},
+		{
+			path:  "foo/baz",
+			kind:  vfs.CharDevice,
+			major: 12,
+			minor: 40,
+			perms: 0666,
+		},
+		{
+			path:  "a/b/c/d/e",
+			kind:  vfs.BlockDevice,
+			major: 12,
+			minor: 34,
+			perms: 0600,
+		},
 	}
-	if stat.RdevMinor != minor {
-		t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor)
+
+	for _, f := range devFiles {
+		if err := a.CreateDeviceFile(ctx, f.path, f.kind, f.major, f.minor, f.perms); err != nil {
+			t.Fatalf("failed to create device file: %v", err)
+		}
+		// The device special file should be visible in the test mount namespace.
+		abspath := path.Join(devPath, f.path)
+		stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_TYPE | linux.STATX_MODE,
+		})
+		if err != nil {
+			t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+		}
+		if stat.RdevMajor != f.major {
+			t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, f.major)
+		}
+		if stat.RdevMinor != f.minor {
+			t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, f.minor)
+		}
+		wantMode := f.perms
+		switch f.kind {
+		case vfs.CharDevice:
+			wantMode |= linux.S_IFCHR
+		case vfs.BlockDevice:
+			wantMode |= linux.S_IFBLK
+		}
+		if stat.Mode != wantMode {
+			t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+		}
 	}
 }
-- 
cgit v1.2.3


From 1666c8919d9d4ced966977f23e2905ff835eaaa0 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 21 Aug 2020 14:28:27 -0700
Subject: Make mounts ReadWrite first, then later change to ReadOnly.

This lets us create "synthetic" mountpoint directories in ReadOnly mounts
during VFS setup.

Also add context.WithMountNamespace, as some filesystems (like overlay) require
a MountNamespace on ctx to handle vfs.Filesystem Operations.

PiperOrigin-RevId: 327874971
---
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go       |  2 +-
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  2 +-
 pkg/sentry/fsimpl/proc/tasks_test.go              |  2 +-
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go         |  2 +-
 pkg/sentry/syscalls/linux/vfs2/mount.go           |  4 +-
 pkg/sentry/vfs/context.go                         | 24 ++++++++++
 pkg/sentry/vfs/mount.go                           | 19 ++++++--
 runsc/boot/vfs.go                                 | 55 ++++++++++++++++-------
 8 files changed, 83 insertions(+), 27 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 6b56c5e71..827a608cb 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -63,7 +63,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu
 	}); err != nil {
 		t.Fatalf("failed to create mount point: %v", err)
 	}
-	if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
+	if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
 		t.Fatalf("failed to mount devtmpfs: %v", err)
 	}
 
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 8f7d5a9bb..a2cc9b59f 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -90,7 +90,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+	if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
 		GetFilesystemOptions: vfs.GetFilesystemOptions{
 			InternalData: int(f.Fd()),
 		},
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 3c9297dee..d82b3d2f3 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -132,7 +132,7 @@ func setup(t *testing.T) *testutil.System {
 			},
 		},
 	}
-	if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+	if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
 		t.Fatalf("MountAt(/proc): %v", err)
 	}
 	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index d263147c2..e5a4218e8 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -405,7 +405,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef(ctx)
 			// Create and mount the submount.
-			if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+			if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
index 4bd5c7ca2..769c9b92f 100644
--- a/pkg/sentry/syscalls/linux/vfs2/mount.go
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -109,8 +109,8 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return 0, nil, err
 	}
 	defer target.Release(t)
-
-	return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+	_, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+	return 0, nil, err
 }
 
 // Umount2 implements Linux syscall umount2(2).
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index c9e724fef..97018651f 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -40,6 +40,30 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
 	return nil
 }
 
+type mountNamespaceContext struct {
+	context.Context
+	mntns *MountNamespace
+}
+
+// Value implements Context.Value.
+func (mc mountNamespaceContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxMountNamespace:
+		mc.mntns.IncRef()
+		return mc.mntns
+	default:
+		return mc.Context.Value(key)
+	}
+}
+
+// WithMountNamespace returns a copy of ctx with the given MountNamespace.
+func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context {
+	return &mountNamespaceContext{
+		Context: ctx,
+		mntns:   mntns,
+	}
+}
+
 // RootFromContext returns the VFS root used by ctx. It takes a reference on
 // the returned VirtualDentry. If ctx does not have a specific VFS root,
 // RootFromContext returns a zero-value VirtualDentry.
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 714af6907..09fea3628 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -263,16 +263,20 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr
 }
 
 // MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// The VirtualFilesystem will hold a reference to the Mount until it is unmounted.
+//
+// This method returns the mounted Mount without a reference, for convenience
+// during VFS setup when there is no chance of racing with unmount.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
 	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	defer mnt.DecRef(ctx)
 	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
-		return err
+		return nil, err
 	}
-	return nil
+	return mnt, nil
 }
 
 // UmountAt removes the Mount at the given path.
@@ -657,6 +661,13 @@ retryFirst:
 	return VirtualDentry{mnt, d}
 }
 
+// SetMountReadOnly sets the mount as ReadOnly.
+func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	return mnt.setReadOnlyLocked(ro)
+}
+
 // CheckBeginWrite increments the counter of in-progress write operations on
 // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
 // EROFS.
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index f27a6ff6b..fb200e988 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -205,15 +205,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.
 	for i := range mounts {
 		submount := &mounts[i]
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		var (
+			mnt *vfs.Mount
+			err error
+		)
+
 		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
-			if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+			mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
+			if err != nil {
 				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
 			}
 		} else {
-			if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+			mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
+			if err != nil {
 				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
 			}
 		}
+
+		if mnt != nil && mnt.ReadOnly() {
+			// Switch to ReadWrite while we setup submounts.
+			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
+				return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err)
+			}
+			defer func() {
+				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
+					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
+				}
+			}()
+		}
 	}
 
 	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
@@ -256,7 +275,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 	return mounts, nil
 }
 
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
 	root := mns.Root()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
@@ -266,21 +285,22 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
 	}
 	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
 	if err != nil {
-		return fmt.Errorf("mountOptions failed: %w", err)
+		return nil, fmt.Errorf("mountOptions failed: %w", err)
 	}
 	if len(fsName) == 0 {
 		// Filesystem is not supported (e.g. cgroup), just skip it.
-		return nil
+		return nil, nil
 	}
 
 	if err := c.k.VFS().MkdirAllAt(ctx, submount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil {
-		return err
+		return nil, err
 	}
-	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
-		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
+	if err != nil {
+		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
 	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
-	return nil
+	return mnt, nil
 }
 
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
@@ -407,7 +427,8 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config
 			// another user. This is normally done for /tmp.
 			Options: []string{"mode=01777"},
 		}
-		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		_, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		return err
 
 	case syserror.ENOTDIR:
 		// Not a dir?! Let it be.
@@ -458,25 +479,25 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf
 
 // mountSharedSubmount binds mount to a previously mounted volume that is shared
 // among containers in the same pod.
-func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
 	if err := source.checkCompatible(mount); err != nil {
-		return err
+		return nil, err
 	}
 
 	_, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
 	if err != nil {
-		return err
+		return nil, err
 	}
 	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	defer newMnt.DecRef(ctx)
 
 	root := mns.Root()
 	defer root.DecRef(ctx)
 	if err := c.k.VFS().MkdirAllAt(ctx, mount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil {
-		return err
+		return nil, err
 	}
 
 	target := &vfs.PathOperation{
@@ -485,8 +506,8 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
 		Path:  fspath.Parse(mount.Destination),
 	}
 	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
-		return err
+		return nil, err
 	}
 	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
-	return nil
+	return newMnt, nil
 }
-- 
cgit v1.2.3


From edf3d6c9e6730d246fd7f26925fbfec8823638d2 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 21 Aug 2020 15:04:24 -0700
Subject: Pass overlay credentials via context in copy up.

Some VFS operations (those which operate on FDs) get their credentials via the
context instead of via an explicit creds param. For these cases, we must pass
the overlay credentials on the context.

PiperOrigin-RevId: 327881259
---
 pkg/sentry/contexttest/contexttest.go | 22 +---------------------
 pkg/sentry/fsimpl/overlay/copy_up.go  |  5 +++++
 pkg/sentry/kernel/auth/context.go     | 20 ++++++++++++++++++++
 3 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go
index 8e5658c7a..dfd195a23 100644
--- a/pkg/sentry/contexttest/contexttest.go
+++ b/pkg/sentry/contexttest/contexttest.go
@@ -144,27 +144,7 @@ func (t *TestContext) MemoryFile() *pgalloc.MemoryFile {
 // RootContext returns a Context that may be used in tests that need root
 // credentials. Uses ptrace as the platform.Platform.
 func RootContext(tb testing.TB) context.Context {
-	return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
-}
-
-// WithCreds returns a copy of ctx carrying creds.
-func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context {
-	return &authContext{ctx, creds}
-}
-
-type authContext struct {
-	context.Context
-	creds *auth.Credentials
-}
-
-// Value implements context.Context.
-func (ac *authContext) Value(key interface{}) interface{} {
-	switch key {
-	case auth.CtxCredentials:
-		return ac.creds
-	default:
-		return ac.Context.Value(key)
-	}
+	return auth.ContextWithCredentials(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
 }
 
 // WithLimitSet returns a copy of ctx carrying l.
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index b3d19ff82..13735eb05 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -40,6 +41,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		return nil
 	}
 
+	// Attach our credentials to the context, as some VFS operations use
+	// credentials from context rather an take an explicit creds parameter.
+	ctx = auth.ContextWithCredentials(ctx, d.fs.creds)
+
 	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
 	switch ftype {
 	case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
index ef5723127..c08d47787 100644
--- a/pkg/sentry/kernel/auth/context.go
+++ b/pkg/sentry/kernel/auth/context.go
@@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials {
 	}
 	return NewAnonymousCredentials()
 }
+
+// ContextWithCredentials returns a copy of ctx carrying creds.
+func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context {
+	return &authContext{ctx, creds}
+}
+
+type authContext struct {
+	context.Context
+	creds *Credentials
+}
+
+// Value implements context.Context.
+func (ac *authContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxCredentials:
+		return ac.creds
+	default:
+		return ac.Context.Value(key)
+	}
+}
-- 
cgit v1.2.3


From 3810a62b3a2e6bb55c3d030e15ba09665f2f91b3 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Fri, 21 Aug 2020 16:03:38 -0700
Subject: Clarify seek behaviour for kernfs.GenericDirectoryFD.

- Remove comment about GenericDirectoryFD not being compatible with
  dynamic directories. It is currently being used to implement dynamic
  directories.

- Try to handle SEEK_END better than setting the offset to
  infinity. SEEK_END is poorly defined for dynamic directories
  anyways, so at least try make it work correctly for the static
  entries.

Updates #1193.

PiperOrigin-RevId: 327890128
---
 pkg/sentry/fsimpl/devpts/devpts.go          |  4 ++-
 pkg/sentry/fsimpl/fuse/fusefs.go            |  4 ++-
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go    | 46 +++++++++++++++++++++++------
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 12 ++++----
 pkg/sentry/fsimpl/kernfs/kernfs_test.go     |  8 +++--
 pkg/sentry/fsimpl/proc/filesystem.go        |  6 ++++
 pkg/sentry/fsimpl/proc/subtasks.go          |  4 ++-
 pkg/sentry/fsimpl/proc/task.go              |  8 +++--
 pkg/sentry/fsimpl/proc/task_fds.go          |  8 +++--
 pkg/sentry/fsimpl/proc/tasks.go             |  4 ++-
 pkg/sentry/fsimpl/proc/tasks_sys.go         | 12 ++++----
 pkg/sentry/fsimpl/sys/sys.go                |  4 ++-
 12 files changed, 89 insertions(+), 31 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 7169e91af..3f3a099bd 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -185,7 +185,9 @@ func (i *rootInode) masterClose(t *Terminal) {
 
 // Open implements kernfs.Inode.Open.
 func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 83c24ec25..44021ee4b 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -220,7 +220,9 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke
 
 // Open implements kernfs.Inode.Open.
 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index fcee6200a..6518ff5cd 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,7 +15,7 @@
 package kernfs
 
 import (
-	"math"
+	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -28,9 +28,25 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// SeekEndConfig describes the SEEK_END behaviour for FDs.
+type SeekEndConfig int
+
+// Constants related to SEEK_END behaviour for FDs.
+const (
+	// Consider the end of the file to be after the final static entry. This is
+	// the default option.
+	SeekEndStaticEntries = iota
+	// Consider the end of the file to be at offset 0.
+	SeekEndZero
+)
+
+// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+type GenericDirectoryFDOptions struct {
+	SeekEnd SeekEndConfig
+}
+
 // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
-// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
-// compatible with dynamic directories.
+// inode that uses OrderChildren to track child nodes.
 //
 // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
 // IterDirents callback. The IterDirents callback therefore cannot hash or
@@ -45,6 +61,9 @@ type GenericDirectoryFD struct {
 	vfs.DirectoryFileDescriptionDefaultImpl
 	vfs.LockFD
 
+	// Immutable.
+	seekEnd SeekEndConfig
+
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
 
@@ -57,9 +76,9 @@ type GenericDirectoryFD struct {
 
 // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
 // dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
 	fd := &GenericDirectoryFD{}
-	if err := fd.Init(children, locks, opts); err != nil {
+	if err := fd.Init(children, locks, opts, fdOpts); err != nil {
 		return nil, err
 	}
 	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
@@ -71,12 +90,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
 // Init initializes a GenericDirectoryFD. Use it when overriding
 // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
 // correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
 	fd.LockFD.Init(locks)
+	fd.seekEnd = fdOpts.SeekEnd
 	fd.children = children
 	return nil
 }
@@ -209,9 +229,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 	case linux.SEEK_CUR:
 		offset += fd.off
 	case linux.SEEK_END:
-		// TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
-		// if they are added after SEEK_END.
-		offset = math.MaxInt64
+		switch fd.seekEnd {
+		case SeekEndStaticEntries:
+			fd.children.mu.RLock()
+			offset += int64(len(fd.children.set))
+			offset += 2 // '.' and '..' aren't tracked in children.
+			fd.children.mu.RUnlock()
+		case SeekEndZero:
+			// No-op: offset += 0.
+		default:
+			panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd))
+		}
 	default:
 		return 0, syserror.EINVAL
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index fe8a1e710..885856868 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -555,15 +555,16 @@ type StaticDirectory struct {
 	InodeNoDynamicLookup
 	OrderedChildren
 
-	locks vfs.FileLocks
+	locks  vfs.FileLocks
+	fdOpts GenericDirectoryFDOptions
 }
 
 var _ Inode = (*StaticDirectory)(nil)
 
 // NewStaticDir creates a new static directory and returns its dentry.
-func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry, fdOpts GenericDirectoryFDOptions) *Dentry {
 	inode := &StaticDirectory{}
-	inode.Init(creds, devMajor, devMinor, ino, perm)
+	inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts)
 
 	dentry := &Dentry{}
 	dentry.Init(inode)
@@ -576,16 +577,17 @@ func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64
 }
 
 // Init initializes StaticDirectory.
-func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
+	s.fdOpts = fdOpts
 	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
 }
 
 // Open implements kernfs.Inode.
 func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index c5d5afedf..e5c28c0e4 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -119,7 +119,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 }
 
 func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -151,7 +153,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 }
 
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 2463d51cd..c350ec127 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -110,6 +110,12 @@ func newStaticFile(data string) *staticFile {
 	return &staticFile{StaticData: vfs.StaticData{Data: data}}
 }
 
+func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
+	return kernfs.NewStaticDir(creds, devMajor, devMinor, ino, perm, children, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
+}
+
 // InternalData contains internal data passed in to the procfs mount via
 // vfs.GetFilesystemOptions.InternalData.
 type InternalData struct {
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 79c2725f3..f25747da3 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -155,7 +155,9 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
 // Open implements kernfs.Inode.
 func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &subtasksFD{task: i.task}
-	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
+	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	}); err != nil {
 		return nil, err
 	}
 	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index a5c7aa470..109b31b4c 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -105,7 +105,9 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 
 // Open implements kernfs.Inode.
 func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -142,7 +144,9 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
 	dir := &kernfs.StaticDirectory{}
 
 	// Note: credentials are overridden by taskOwnedInode.
-	dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
+	dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 
 	inode := &taskOwnedInode{Inode: dir, owner: task}
 	d := &kernfs.Dentry{}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index f0d3f7f5e..e8fcb9aa1 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -144,7 +144,9 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 
 // Open implements kernfs.Inode.
 func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -271,7 +273,9 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 
 // Open implements kernfs.Inode.
 func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 6d2b90a8b..1391992b7 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -199,7 +199,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 
 // Open implements kernfs.Inode.
 func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6435385ef..038a194c7 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -39,14 +39,14 @@ const (
 
 // newSysDir returns the dentry corresponding to /proc/sys directory.
 func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
-	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-		"kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+	return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+		"kernel": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
 			"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
 			"shmall":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
 			"shmmax":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
 			"shmmni":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
 		}),
-		"vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+		"vm": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
 			"mmap_min_addr":     fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
 			"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
 		}),
@@ -62,7 +62,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
 	// network namespace of the calling process.
 	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		contents = map[string]*kernfs.Dentry{
-			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"ipv4": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
 				"tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
 				"tcp_rmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
 				"tcp_sack":     fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
@@ -109,7 +109,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
 				"tcp_syn_retries":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
 				"tcp_timestamps":            fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
 			}),
-			"core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"core": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
 				"default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
 				"message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
 				"message_cost":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
@@ -123,7 +123,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
 		}
 	}
 
-	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
+	return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
 }
 
 // mmapMinAddrData implements vfs.DynamicBytesSource for
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0401726b6..393feb802 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -131,7 +131,9 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
 
 // Open implements kernfs.Inode.Open.
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
-- 
cgit v1.2.3


From ac83a6a5ed237e0ddcb473b1cbc2e30d8e6c6740 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 21 Aug 2020 16:18:31 -0700
Subject: Internal change.

PiperOrigin-RevId: 327892274
---
 pkg/sentry/limits/context.go  | 9 +++++++++
 pkg/sentry/vfs/permissions.go | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go
index 77e1fe217..0bade6e57 100644
--- a/pkg/sentry/limits/context.go
+++ b/pkg/sentry/limits/context.go
@@ -33,3 +33,12 @@ func FromContext(ctx context.Context) *LimitSet {
 	}
 	return nil
 }
+
+// FromContextOrDie returns FromContext(ctx) if the latter is not nil.
+// Otherwise, panic is triggered.
+func FromContextOrDie(ctx context.Context) *LimitSet {
+	if v := ctx.Value(CtxLimits); v != nil {
+		return v.(*LimitSet)
+	}
+	panic("failed to create limit set from context")
+}
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 33389c1df..014b928ed 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -271,7 +271,7 @@ func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth
 // operation must not proceed. Otherwise it returns the max length allowed to
 // without violating the limit.
 func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
-	fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
+	fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur
 	if fileSizeLimit > math.MaxInt64 {
 		return size, nil
 	}
-- 
cgit v1.2.3


From 8bf0bd8ab97958bb43bb0388f1f40965cf989207 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 21 Aug 2020 17:24:45 -0700
Subject: Provide fdReader/Writer for FileDescription

fdReader/Writer implements io.Reader/Writer so that they can be passed
to Merkle tree library.

PiperOrigin-RevId: 327901376
---
 pkg/sentry/vfs/file_description.go | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index d3abe28ee..33910e095 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -847,3 +847,31 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn
 	}
 	return fd.asyncHandler
 }
+
+// FileReadWriteSeeker is a helper struct to pass a FileDescription as
+// io.Reader/io.Writer/io.ReadSeeker/etc.
+type FileReadWriteSeeker struct {
+	Fd    *FileDescription
+	Ctx   context.Context
+	ROpts ReadOptions
+	WOpts WriteOptions
+}
+
+// Read implements io.ReadWriteSeeker.Read.
+func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
+	dst := usermem.BytesIOSequence(p)
+	ret, err := f.Fd.Read(f.Ctx, dst, f.ROpts)
+	return int(ret), err
+}
+
+// Seek implements io.ReadWriteSeeker.Seek.
+func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
+	return f.Fd.Seek(f.Ctx, offset, int32(whence))
+}
+
+// Write implements io.ReadWriteSeeker.Write.
+func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
+	buf := usermem.BytesIOSequence(p)
+	ret, err := f.Fd.Write(f.Ctx, buf, f.WOpts)
+	return int(ret), err
+}
-- 
cgit v1.2.3


From b9aa0fd7dacc84bbaffbda41e3b40aa4e876b3c1 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 21 Aug 2020 17:32:19 -0700
Subject: stateify: Fix pretty print not printing odd numbered fields.

PiperOrigin-RevId: 327902182
---
 pkg/state/pretty/pretty.go | 1 -
 1 file changed, 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/state/pretty/pretty.go b/pkg/state/pretty/pretty.go
index cf37aaa49..1375fcc38 100644
--- a/pkg/state/pretty/pretty.go
+++ b/pkg/state/pretty/pretty.go
@@ -148,7 +148,6 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 			element, ok := format(graph, depth+1, *x.Field(i), html)
 			allZero = allZero && !ok
 			items = append(items, fmt.Sprintf("\t%d: %s,", i, element))
-			i++
 		}
 		items = append(items, "}")
 		return strings.Join(items, tabs), !allZero
-- 
cgit v1.2.3


From 4459eb7bb42c1f920760d2ca5e147b81d04fdc00 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Fri, 21 Aug 2020 20:04:31 -0700
Subject: [vfs] Allow mountpoint to be an existing non-directory.

Unlike linux mount(2), OCI spec allows mounting on top of an existing
non-directory file.

PiperOrigin-RevId: 327914342
---
 pkg/sentry/vfs/mount.go | 26 ++++++++++++++++++++++++++
 runsc/boot/vfs.go       |  4 ++--
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 09fea3628..cd5456eef 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -18,12 +18,14 @@ import (
 	"bytes"
 	"fmt"
 	"math"
+	"path"
 	"sort"
 	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -888,6 +890,30 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
 	}
 }
 
+// MakeSyntheticMountpoint creates parent directories of target if they do not
+// exist and attempts to create a directory for the mountpoint. If a
+// non-directory file already exists there then we allow it.
+func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
+	mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+
+	// Make sure the parent directory of target exists.
+	if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
+	}
+
+	// Attempt to mkdir the final component. If a file (of any type) exists
+	// then we let allow mounting on top of that because we do not require the
+	// target to be an existing directory, unlike Linux mount(2).
+	if err := vfs.MkdirAt(ctx, creds, &PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(target),
+	}, mkdirOpts); err != nil && err != syserror.EEXIST {
+		return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
+	}
+	return nil
+}
+
 // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
 // See Linux fs/seq_file.c:mangle_path.
 func manglePath(p string) string {
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index fb200e988..66b6cf19b 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -292,7 +292,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
 		return nil, nil
 	}
 
-	if err := c.k.VFS().MkdirAllAt(ctx, submount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil {
+	if err := c.k.VFS().MakeSyntheticMountpoint(ctx, submount.Destination, root, creds); err != nil {
 		return nil, err
 	}
 	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
@@ -496,7 +496,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
 
 	root := mns.Root()
 	defer root.DecRef(ctx)
-	if err := c.k.VFS().MkdirAllAt(ctx, mount.Destination, root, creds, &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}); err != nil {
+	if err := c.k.VFS().MakeSyntheticMountpoint(ctx, mount.Destination, root, creds); err != nil {
 		return nil, err
 	}
 
-- 
cgit v1.2.3


From 442af00e8cb678859e44a149a97885d102f94edb Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Sat, 22 Aug 2020 09:53:11 -0700
Subject: Implement GetFilesystem for verity fs

verity GetFilesystem is implemented by mounting the underlying file
system, save the mount, and store both the underlying root dentry and
root Merkle file dentry in verity's root dentry.

PiperOrigin-RevId: 327959334
---
 pkg/sentry/fsimpl/verity/verity.go | 128 ++++++++++++++++++++++++++++++++++---
 1 file changed, 118 insertions(+), 10 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index cb29d33a5..1c5b07aa5 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -26,6 +26,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -36,10 +37,16 @@ import (
 // Name is the default filesystem name.
 const Name = "verity"
 
-// testOnlyDebugging allows verity file system to return error instead of
-// crashing the application when a malicious action is detected. This should
-// only be set for tests.
-var testOnlyDebugging bool
+// merklePrefix is the prefix of the Merkle tree files. For example, the Merkle
+// tree file for "/foo" is "/.merkle.verity.foo".
+const merklePrefix = ".merkle.verity."
+
+// noCrashOnVerificationFailure indicates whether the sandbox should panic
+// whenever verification fails. If true, an error is returned instead of
+// panicking. This should only be set for tests.
+// TOOD(b/165661693): Decide whether to panic or return error based on this
+// flag.
+var noCrashOnVerificationFailure bool
 
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
@@ -93,10 +100,10 @@ type InternalFilesystemOptions struct {
 	// system wrapped by verity file system.
 	LowerGetFSOptions vfs.GetFilesystemOptions
 
-	// TestOnlyDebugging allows verity file system to return error instead
-	// of crashing the application when a malicious action is detected. This
-	// should only be set for tests.
-	TestOnlyDebugging bool
+	// NoCrashOnVerificationFailure indicates whether the sandbox should
+	// panic whenever verification fails. If true, an error is returned
+	// instead of panicking. This should only be set for tests.
+	NoCrashOnVerificationFailure bool
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -106,8 +113,109 @@ func (FilesystemType) Name() string {
 
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	//TODO(b/159261227): Implement GetFilesystem.
-	return nil, nil, nil
+	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+	if !ok {
+		ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
+		return nil, nil, syserror.EINVAL
+	}
+	noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure
+
+	// Mount the lower file system. The lower file system is wrapped inside
+	// verity, and should not be exposed or connected.
+	mopts := &vfs.MountOptions{
+		GetFilesystemOptions: iopts.LowerGetFSOptions,
+	}
+	mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		creds:              creds.Fork(),
+		lowerMount:         mnt,
+		allowRuntimeEnable: iopts.AllowRuntimeEnable,
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry.
+	d := fs.newDentry()
+	d.refs = 1
+	lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root())
+	lowerVD.IncRef()
+	d.lowerVD = lowerVD
+
+	rootMerkleName := merklePrefix + iopts.RootMerkleFileName
+
+	lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+		Path:  fspath.Parse(rootMerkleName),
+	}, &vfs.GetDentryOptions{})
+
+	// If runtime enable is allowed, the root merkle tree may be absent. We
+	// should create the tree file.
+	if err == syserror.ENOENT && fs.allowRuntimeEnable {
+		lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDWR | linux.O_CREAT,
+			Mode:  0644,
+		})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+		lowerMerkleFD.DecRef(ctx)
+		lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.GetDentryOptions{})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+	} else if err != nil {
+		// Failed to get dentry for the root Merkle file. This indicates
+		// an attack that removed/renamed the root Merkle file, or it's
+		// never generated.
+		if noCrashOnVerificationFailure {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+		panic("Failed to find root Merkle file")
+	}
+	d.lowerMerkleVD = lowerMerkleVD
+
+	// Get metadata from the underlying file system.
+	const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.StatOptions{
+		Mask: statMask,
+	})
+	if err != nil {
+		fs.vfsfs.DecRef(ctx)
+		d.DecRef(ctx)
+		return nil, nil, err
+	}
+
+	// TODO(b/162788573): Verify Metadata.
+	d.mode = uint32(stat.Mode)
+	d.uid = stat.UID
+	d.gid = stat.GID
+
+	d.rootHash = make([]byte, len(iopts.RootHash))
+	copy(d.rootHash, iopts.RootHash)
+	d.vfsd.Init(d)
+
+	return &fs.vfsfs, &d.vfsd, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
-- 
cgit v1.2.3


From 78c8c9e4c8a47818df14aac192a33a0f7b9f9006 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 24 Aug 2020 11:28:28 -0700
Subject: Update inotify documentation for gofer filesystem.

We now allow hard links to be created within gofer fs (see
github.com/google/gvisor/commit/f20e63e31b56784c596897e86f03441f9d05f567).
Update the inotify documentation accordingly.

PiperOrigin-RevId: 328177485
---
 pkg/sentry/fsimpl/gofer/gofer.go     |  7 +++++++
 pkg/sentry/syscalls/linux/linux64.go | 14 +++++++-------
 pkg/sentry/vfs/g3doc/inotify.md      | 18 +++++++++---------
 3 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index c6696b9d8..81d34cfe3 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -703,6 +703,13 @@ type dentry struct {
 	locks vfs.FileLocks
 
 	// Inotify watches for this dentry.
+	//
+	// Note that inotify may behave unexpectedly in the presence of hard links,
+	// because dentries corresponding to the same file have separate inotify
+	// watches when they should share the same set. This is the case because it is
+	// impossible for us to know for sure whether two dentries correspond to the
+	// same underlying file (see the gofer filesystem section fo vfs/inotify.md for
+	// a more in-depth discussion on this matter).
 	watches vfs.Watches
 }
 
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 80c65164a..da6bd85e1 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -305,9 +305,9 @@ var AMD64 = &kernel.SyscallTable{
 		250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
 		251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
 		252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
-		253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
-		254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
-		255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		253: syscalls.PartiallySupported("inotify_init", InotifyInit, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
 		256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
 		257: syscalls.Supported("openat", Openat),
 		258: syscalls.Supported("mkdirat", Mkdirat),
@@ -346,7 +346,7 @@ var AMD64 = &kernel.SyscallTable{
 		291: syscalls.Supported("epoll_create1", EpollCreate1),
 		292: syscalls.Supported("dup3", Dup3),
 		293: syscalls.Supported("pipe2", Pipe2),
-		294: syscalls.Supported("inotify_init1", InotifyInit1),
+		294: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
 		295: syscalls.Supported("preadv", Preadv),
 		296: syscalls.Supported("pwritev", Pwritev),
 		297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
@@ -454,9 +454,9 @@ var ARM64 = &kernel.SyscallTable{
 		23:  syscalls.Supported("dup", Dup),
 		24:  syscalls.Supported("dup3", Dup3),
 		25:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
-		26:  syscalls.Supported("inotify_init1", InotifyInit1),
-		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
-		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		26:  syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
 		29:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
 		30:  syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
 		31:  syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md
index e7da49faa..833db213f 100644
--- a/pkg/sentry/vfs/g3doc/inotify.md
+++ b/pkg/sentry/vfs/g3doc/inotify.md
@@ -28,9 +28,9 @@ The set of all watches held on a single file (i.e., the watch target) is stored
 in vfs.Watches. Each watch will belong to a different inotify instance (an
 instance can only have one watch on any watch target). The watches are stored in
 a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions
-to a single file will all share the same vfs.Watches. Activity on the target
-causes its vfs.Watches to generate notifications on its watches’ inotify
-instances.
+to a single file will all share the same vfs.Watches (with the exception of the
+gofer filesystem, described in a later section). Activity on the target causes
+its vfs.Watches to generate notifications on its watches’ inotify instances.
 
 ### vfs.Watch
 
@@ -103,12 +103,12 @@ inotify:
     unopened p9 file (and possibly an open FID), through which the Sentry
     interacts with the gofer.
     *   *Solution:* Because there is no inode structure stored in the sandbox,
-        inotify watches must be held on the dentry. This would be an issue in
-        the presence of hard links, where multiple dentries would need to share
-        the same set of watches, but in VFS2, we do not support the internal
-        creation of hard links on gofer fs. As a result, we make the assumption
-        that every dentry corresponds to a unique inode. However, the next point
-        raises an issue with this assumption:
+        inotify watches must be held on the dentry. For the purposes of inotify,
+        we assume that every dentry corresponds to a unique inode, which may
+        cause unexpected behavior in the presence of hard links, where multiple
+        dentries should share the same set of watches. Indeed, it is impossible
+        for us to be absolutely sure whether dentries correspond to the same
+        file or not, due to the following point:
 *   **The Sentry cannot always be aware of hard links on the remote
     filesystem.** There is no way for us to confirm whether two files on the
     remote filesystem are actually links to the same inode. QIDs and inodes are
-- 
cgit v1.2.3


From 901de6dc776c00bbdd60c12e800c6b10839b1466 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 24 Aug 2020 12:27:01 -0700
Subject: Consider loopback bound to all addresses in subnet

When a loopback interface is configurd with an address and associated
subnet, the loopback should treat all addresses in that subnet as an
address it owns.

This is mimicking linux behaviour as seen below:
```
$ ip addr show dev lo
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group ...
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
$ ping 192.0.2.1
PING 192.0.2.1 (192.0.2.1) 56(84) bytes of data.
^C
--- 192.0.2.1 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 1018ms

$ ping 192.0.2.2
PING 192.0.2.2 (192.0.2.2) 56(84) bytes of data.
^C
--- 192.0.2.2 ping statistics ---
3 packets transmitted, 0 received, 100% packet loss, time 2039ms

$ sudo ip addr add 192.0.2.1/24 dev lo
$ ip addr show dev lo
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group ...
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet 192.0.2.1/24 scope global lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
$ ping 192.0.2.1
PING 192.0.2.1 (192.0.2.1) 56(84) bytes of data.
64 bytes from 192.0.2.1: icmp_seq=1 ttl=64 time=0.131 ms
64 bytes from 192.0.2.1: icmp_seq=2 ttl=64 time=0.046 ms
64 bytes from 192.0.2.1: icmp_seq=3 ttl=64 time=0.048 ms
^C
--- 192.0.2.1 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2042ms
rtt min/avg/max/mdev = 0.046/0.075/0.131/0.039 ms
$ ping 192.0.2.2
PING 192.0.2.2 (192.0.2.2) 56(84) bytes of data.
64 bytes from 192.0.2.2: icmp_seq=1 ttl=64 time=0.131 ms
64 bytes from 192.0.2.2: icmp_seq=2 ttl=64 time=0.069 ms
64 bytes from 192.0.2.2: icmp_seq=3 ttl=64 time=0.049 ms
64 bytes from 192.0.2.2: icmp_seq=4 ttl=64 time=0.035 ms
^C
--- 192.0.2.2 ping statistics ---
4 packets transmitted, 4 received, 0% packet loss, time 3049ms
rtt min/avg/max/mdev = 0.035/0.071/0.131/0.036 ms
```

Test: integration_test.TestLoopbackAcceptAllInSubnet
PiperOrigin-RevId: 328188546
---
 pkg/tcpip/stack/nic.go                             |  13 +-
 pkg/tcpip/tests/integration/BUILD                  |   6 +-
 pkg/tcpip/tests/integration/loopback_test.go       | 229 +++++++++++++++++++++
 .../tests/integration/multicast_broadcast_test.go  |   2 +-
 test/syscalls/BUILD                                |   8 +
 test/syscalls/linux/BUILD                          |  83 ++++++++
 .../linux/socket_ip_udp_unbound_netlink_util.cc    |  58 ++++++
 .../linux/socket_ip_udp_unbound_netlink_util.h     |  34 +++
 .../socket_ipv4_udp_unbound_loopback_netlink.cc    |  32 +++
 .../linux/socket_ipv4_udp_unbound_netlink.cc       |  60 ++++++
 .../linux/socket_ipv4_udp_unbound_netlink.h        |  29 +++
 .../socket_ipv6_udp_unbound_loopback_netlink.cc    |  32 +++
 .../linux/socket_ipv6_udp_unbound_netlink.cc       |  60 ++++++
 .../linux/socket_ipv6_udp_unbound_netlink.h        |  29 +++
 14 files changed, 672 insertions(+), 3 deletions(-)
 create mode 100644 pkg/tcpip/tests/integration/loopback_test.go
 create mode 100644 test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc
 create mode 100644 test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h
 create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc
 create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
 create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
 create mode 100644 test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc
 create mode 100644 test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
 create mode 100644 test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 728292782..aff29f9cc 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -666,8 +666,19 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	}
 
 	// A usable reference was not found, create a temporary one if requested by
-	// the caller or if the address is found in the NIC's subnets.
+	// the caller or if the address is found in the NIC's subnets and the NIC is
+	// a loopback interface.
 	createTempEP := spoofingOrPromiscuous
+	if !createTempEP && n.isLoopback() {
+		for _, r := range n.mu.endpoints {
+			addr := r.addrWithPrefix()
+			subnet := addr.Subnet()
+			if subnet.Contains(address) {
+				createTempEP = true
+				break
+			}
+		}
+	}
 	n.mu.RUnlock()
 
 	if !createTempEP {
diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD
index 6d52af98a..06c7a3cd3 100644
--- a/pkg/tcpip/tests/integration/BUILD
+++ b/pkg/tcpip/tests/integration/BUILD
@@ -5,12 +5,16 @@ package(licenses = ["notice"])
 go_test(
     name = "integration_test",
     size = "small",
-    srcs = ["multicast_broadcast_test.go"],
+    srcs = [
+        "loopback_test.go",
+        "multicast_broadcast_test.go",
+    ],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go
new file mode 100644
index 000000000..3a2f75837
--- /dev/null
+++ b/pkg/tcpip/tests/integration/loopback_test.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration_test
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TestLoopbackAcceptAllInSubnet tests that a loopback interface considers
+// itself bound to all addresses in the subnet of an assigned address.
+func TestLoopbackAcceptAllInSubnet(t *testing.T) {
+	const (
+		nicID     = 1
+		localPort = 80
+	)
+
+	data := []byte{1, 2, 3, 4}
+
+	ipv4ProtocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv4ProtocolNumber,
+		AddressWithPrefix: ipv4Addr,
+	}
+	ipv4Bytes := []byte(ipv4Addr.Address)
+	ipv4Bytes[len(ipv4Bytes)-1]++
+	otherIPv4Address := tcpip.Address(ipv4Bytes)
+
+	ipv6ProtocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: ipv6Addr,
+	}
+	ipv6Bytes := []byte(ipv6Addr.Address)
+	ipv6Bytes[len(ipv6Bytes)-1]++
+	otherIPv6Address := tcpip.Address(ipv6Bytes)
+
+	tests := []struct {
+		name       string
+		addAddress tcpip.ProtocolAddress
+		bindAddr   tcpip.Address
+		dstAddr    tcpip.Address
+		expectRx   bool
+	}{
+		{
+			name:       "IPv4 bind to wildcard and send to assigned address",
+			addAddress: ipv4ProtocolAddress,
+			dstAddr:    ipv4Addr.Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv4 bind to wildcard and send to other subnet-local address",
+			addAddress: ipv4ProtocolAddress,
+			dstAddr:    otherIPv4Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv4 bind to wildcard send to other address",
+			addAddress: ipv4ProtocolAddress,
+			dstAddr:    remoteIPv4Addr,
+			expectRx:   false,
+		},
+		{
+			name:       "IPv4 bind to other subnet-local address and send to assigned address",
+			addAddress: ipv4ProtocolAddress,
+			bindAddr:   otherIPv4Address,
+			dstAddr:    ipv4Addr.Address,
+			expectRx:   false,
+		},
+		{
+			name:       "IPv4 bind and send to other subnet-local address",
+			addAddress: ipv4ProtocolAddress,
+			bindAddr:   otherIPv4Address,
+			dstAddr:    otherIPv4Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv4 bind to assigned address and send to other subnet-local address",
+			addAddress: ipv4ProtocolAddress,
+			bindAddr:   ipv4Addr.Address,
+			dstAddr:    otherIPv4Address,
+			expectRx:   false,
+		},
+
+		{
+			name:       "IPv6 bind and send to assigned address",
+			addAddress: ipv6ProtocolAddress,
+			bindAddr:   ipv6Addr.Address,
+			dstAddr:    ipv6Addr.Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv6 bind to wildcard and send to assigned address",
+			addAddress: ipv6ProtocolAddress,
+			dstAddr:    ipv6Addr.Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv6 bind to wildcard and send to other subnet-local address",
+			addAddress: ipv6ProtocolAddress,
+			dstAddr:    otherIPv6Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv6 bind to wildcard send to other address",
+			addAddress: ipv6ProtocolAddress,
+			dstAddr:    remoteIPv6Addr,
+			expectRx:   false,
+		},
+		{
+			name:       "IPv6 bind to other subnet-local address and send to assigned address",
+			addAddress: ipv6ProtocolAddress,
+			bindAddr:   otherIPv6Address,
+			dstAddr:    ipv6Addr.Address,
+			expectRx:   false,
+		},
+		{
+			name:       "IPv6 bind and send to other subnet-local address",
+			addAddress: ipv6ProtocolAddress,
+			bindAddr:   otherIPv6Address,
+			dstAddr:    otherIPv6Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv6 bind to assigned address and send to other subnet-local address",
+			addAddress: ipv6ProtocolAddress,
+			bindAddr:   ipv6Addr.Address,
+			dstAddr:    otherIPv6Address,
+			expectRx:   false,
+		},
+		{
+			name:       "IPv6 bind and send to assigned address",
+			addAddress: ipv6ProtocolAddress,
+			bindAddr:   ipv6Addr.Address,
+			dstAddr:    ipv6Addr.Address,
+			expectRx:   true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			if err := s.CreateNIC(nicID, loopback.New()); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
+			if err := s.AddProtocolAddress(nicID, test.addAddress); err != nil {
+				t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, test.addAddress, err)
+			}
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: header.IPv4EmptySubnet,
+					NIC:         nicID,
+				},
+				tcpip.Route{
+					Destination: header.IPv6EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			wq := waiter.Queue{}
+			rep, err := s.NewEndpoint(udp.ProtocolNumber, test.addAddress.Protocol, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err)
+			}
+			defer rep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: test.bindAddr, Port: localPort}
+			if err := rep.Bind(bindAddr); err != nil {
+				t.Fatalf("rep.Bind(%+v): %s", bindAddr, err)
+			}
+
+			sep, err := s.NewEndpoint(udp.ProtocolNumber, test.addAddress.Protocol, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err)
+			}
+			defer sep.Close()
+
+			wopts := tcpip.WriteOptions{
+				To: &tcpip.FullAddress{
+					Addr: test.dstAddr,
+					Port: localPort,
+				},
+			}
+			n, _, err := sep.Write(tcpip.SlicePayload(data), wopts)
+			if err != nil {
+				t.Fatalf("sep.Write(_, _): %s", err)
+			}
+			if want := int64(len(data)); n != want {
+				t.Fatalf("got sep.Write(_, _) = (%d, _, nil), want = (%d, _, nil)", n, want)
+			}
+
+			if gotPayload, _, err := rep.Read(nil); test.expectRx {
+				if err != nil {
+					t.Fatalf("reep.Read(nil): %s", err)
+				}
+				if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" {
+					t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Fatalf("got rep.Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
index 9f0dd4d6d..52c27e045 100644
--- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go
+++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
@@ -430,7 +430,7 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) {
 				}
 			} else {
 				if err != tcpip.ErrWouldBlock {
-					t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+					t.Fatalf("got Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
 				}
 			}
 		})
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index ad53e92e5..eea1401ac 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -674,6 +674,14 @@ syscall_test(
     test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_nogotsan_test",
 )
 
+syscall_test(
+    test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_netlink_test",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:socket_ipv6_udp_unbound_loopback_netlink_test",
+)
+
 syscall_test(
     test = "//test/syscalls/linux:socket_ip_unbound_test",
 )
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ecd2d8d2a..ed0b6ecf4 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2402,6 +2402,57 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "socket_ip_udp_unbound_netlink_test_utils",
+    testonly = 1,
+    srcs = [
+        "socket_ip_udp_unbound_netlink_util.cc",
+    ],
+    hdrs = [
+        "socket_ip_udp_unbound_netlink_util.h",
+    ],
+    deps = [
+        ":socket_test_util",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "socket_ipv4_udp_unbound_netlink_test_cases",
+    testonly = 1,
+    srcs = [
+        "socket_ipv4_udp_unbound_netlink.cc",
+    ],
+    hdrs = [
+        "socket_ipv4_udp_unbound_netlink.h",
+    ],
+    deps = [
+        ":socket_ip_udp_unbound_netlink_test_utils",
+        ":socket_netlink_route_util",
+        "//test/util:capability_util",
+        gtest,
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "socket_ipv6_udp_unbound_netlink_test_cases",
+    testonly = 1,
+    srcs = [
+        "socket_ipv6_udp_unbound_netlink.cc",
+    ],
+    hdrs = [
+        "socket_ipv6_udp_unbound_netlink.h",
+    ],
+    deps = [
+        ":socket_ip_udp_unbound_netlink_test_utils",
+        ":socket_netlink_route_util",
+        "//test/util:capability_util",
+        gtest,
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "socket_ipv4_udp_unbound_external_networking_test_cases",
     testonly = 1,
@@ -2755,6 +2806,38 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "socket_ipv4_udp_unbound_loopback_netlink_test",
+    testonly = 1,
+    srcs = [
+        "socket_ipv4_udp_unbound_loopback_netlink.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_ipv4_udp_unbound_netlink_test_cases",
+        ":socket_test_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "socket_ipv6_udp_unbound_loopback_netlink_test",
+    testonly = 1,
+    srcs = [
+        "socket_ipv6_udp_unbound_loopback_netlink.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_ipv6_udp_unbound_netlink_test_cases",
+        ":socket_test_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_binary(
     name = "socket_ip_unbound_test",
     testonly = 1,
diff --git a/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc b/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc
new file mode 100644
index 000000000..13ffafde7
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc
@@ -0,0 +1,58 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+const size_t kSendBufSize = 200;
+
+void IPUDPUnboundSocketNetlinkTest::TestSendRecv(TestAddress sender_addr,
+                                                 TestAddress receiver_addr) {
+  auto snd_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto rcv_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  EXPECT_THAT(
+      bind(snd_sock->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
+
+  EXPECT_THAT(
+      bind(rcv_sock->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t receiver_addr_len = receiver_addr.addr_len;
+  ASSERT_THAT(getsockname(rcv_sock->get(),
+                          reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+                          &receiver_addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+  char send_buf[kSendBufSize];
+  RandomizeBuffer(send_buf, kSendBufSize);
+  EXPECT_THAT(
+      RetryEINTR(sendto)(snd_sock->get(), send_buf, kSendBufSize, 0,
+                         reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+                         receiver_addr.addr_len),
+      SyscallSucceedsWithValue(kSendBufSize));
+
+  // Check that we received the packet.
+  char recv_buf[kSendBufSize] = {};
+  ASSERT_THAT(RetryEINTR(recv)(rcv_sock->get(), recv_buf, kSendBufSize, 0),
+              SyscallSucceedsWithValue(kSendBufSize));
+  EXPECT_EQ(0, memcmp(send_buf, recv_buf, kSendBufSize));
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h b/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h
new file mode 100644
index 000000000..157fb0939
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_NETLINK_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_NETLINK_UTIL_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to IP UDP sockets.
+class IPUDPUnboundSocketNetlinkTest : public SimpleSocketTest {
+ public:
+  // TestSendRecv tests sending and receiving a UDP packet from |sender_addr| to
+  // |receiver_addr|.
+  void TestSendRecv(TestAddress sender_addr, TestAddress receiver_addr);
+};
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_NETLINK_UTIL_H_
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc
new file mode 100644
index 000000000..8052bf404
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+INSTANTIATE_TEST_SUITE_P(
+    IPv4UDPSockets, IPv4UDPUnboundSocketNetlinkTest,
+    ::testing::ValuesIn(ApplyVec<SocketKind>(IPv4UDPUnboundSocket,
+                                             AllBitwiseCombinations(List<int>{
+                                                 0, SOCK_NONBLOCK}))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
new file mode 100644
index 000000000..696fbb189
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h"
+
+#include <arpa/inet.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/util/capability_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Checks that the loopback interface considers itself bound to all IPs in an
+// associated subnet.
+TEST_P(IPv4UDPUnboundSocketNetlinkTest, JoinSubnet) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Add an IP address to the loopback interface.
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+  struct in_addr addr;
+  EXPECT_EQ(1, inet_pton(AF_INET, "192.0.2.1", &addr));
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET,
+                                   /*prefixlen=*/24, &addr, sizeof(addr)));
+
+  // Send from an unassigned address but an address that is in the subnet
+  // associated with the loopback interface.
+  TestAddress sender_addr("V4NotAssignd1");
+  sender_addr.addr.ss_family = AF_INET;
+  sender_addr.addr_len = sizeof(sockaddr_in);
+  EXPECT_EQ(1, inet_pton(AF_INET, "192.0.2.2",
+                         &(reinterpret_cast<sockaddr_in*>(&sender_addr.addr)
+                               ->sin_addr.s_addr)));
+
+  // Send the packet to an unassigned address but an address that is in the
+  // subnet associated with the loopback interface.
+  TestAddress receiver_addr("V4NotAssigned2");
+  receiver_addr.addr.ss_family = AF_INET;
+  receiver_addr.addr_len = sizeof(sockaddr_in);
+  EXPECT_EQ(1, inet_pton(AF_INET, "192.0.2.254",
+                         &(reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)
+                               ->sin_addr.s_addr)));
+
+  TestSendRecv(sender_addr, receiver_addr);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
new file mode 100644
index 000000000..fcfb3318e
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
+
+#include "test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to IPv4 UDP sockets.
+using IPv4UDPUnboundSocketNetlinkTest = IPUDPUnboundSocketNetlinkTest;
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc
new file mode 100644
index 000000000..17021ff82
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+INSTANTIATE_TEST_SUITE_P(
+    IPv6UDPSockets, IPv6UDPUnboundSocketNetlinkTest,
+    ::testing::ValuesIn(ApplyVec<SocketKind>(IPv6UDPUnboundSocket,
+                                             AllBitwiseCombinations(List<int>{
+                                                 0, SOCK_NONBLOCK}))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
new file mode 100644
index 000000000..6275b5aed
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h"
+
+#include <arpa/inet.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/util/capability_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Checks that the loopback interface considers itself bound to all IPs in an
+// associated subnet.
+TEST_P(IPv6UDPUnboundSocketNetlinkTest, JoinSubnet) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Add an IP address to the loopback interface.
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+  struct in6_addr addr;
+  EXPECT_EQ(1, inet_pton(AF_INET6, "2001:db8::1", &addr));
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET6,
+                                   /*prefixlen=*/64, &addr, sizeof(addr)));
+
+  // Send from an unassigned address but an address that is in the subnet
+  // associated with the loopback interface.
+  TestAddress sender_addr("V6NotAssignd1");
+  sender_addr.addr.ss_family = AF_INET6;
+  sender_addr.addr_len = sizeof(sockaddr_in6);
+  EXPECT_EQ(1, inet_pton(AF_INET6, "2001:db8::2",
+                         reinterpret_cast<sockaddr_in6*>(&sender_addr.addr)
+                             ->sin6_addr.s6_addr));
+
+  // Send the packet to an unassigned address but an address that is in the
+  // subnet associated with the loopback interface.
+  TestAddress receiver_addr("V6NotAssigned2");
+  receiver_addr.addr.ss_family = AF_INET6;
+  receiver_addr.addr_len = sizeof(sockaddr_in6);
+  EXPECT_EQ(1, inet_pton(AF_INET6, "2001:db8::ffff:ffff:ffff:ffff",
+                         reinterpret_cast<sockaddr_in6*>(&receiver_addr.addr)
+                             ->sin6_addr.s6_addr));
+
+  TestSendRecv(sender_addr, receiver_addr);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h
new file mode 100644
index 000000000..6a2b0a5be
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
+
+#include "test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to IPv6 UDP sockets.
+using IPv6UDPUnboundSocketNetlinkTest = IPUDPUnboundSocketNetlinkTest;
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
-- 
cgit v1.2.3


From 7eb284eca20b46570c3bd4e9a49113ac5165afbd Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 24 Aug 2020 12:56:58 -0700
Subject: Bump build constraints to 1.17

This enables pre-release testing with 1.16. The intention is to replace these
with a nogo check before the next release.

PiperOrigin-RevId: 328193911
---
 pkg/procid/procid_amd64.s                           | 2 +-
 pkg/procid/procid_arm64.s                           | 2 +-
 pkg/sentry/platform/kvm/bluepill_unsafe.go          | 2 +-
 pkg/sentry/platform/kvm/machine_unsafe.go           | 2 +-
 pkg/sentry/platform/ptrace/subprocess_unsafe.go     | 2 +-
 pkg/sentry/vfs/mount_unsafe.go                      | 2 +-
 pkg/sleep/sleep_unsafe.go                           | 2 +-
 pkg/sync/memmove_unsafe.go                          | 2 +-
 pkg/sync/mutex_unsafe.go                            | 2 +-
 pkg/sync/rwmutex_unsafe.go                          | 2 +-
 pkg/syncevent/waiter_unsafe.go                      | 2 +-
 pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go | 2 +-
 pkg/tcpip/time_unsafe.go                            | 2 +-
 13 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s
index 7c622e5d7..a45920040 100644
--- a/pkg/procid/procid_amd64.s
+++ b/pkg/procid/procid_amd64.s
@@ -14,7 +14,7 @@
 
 // +build amd64
 // +build go1.8
-// +build !go1.16
+// +build !go1.17
 
 #include "textflag.h"
 
diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s
index 48ebb5fd1..9d3b0666d 100644
--- a/pkg/procid/procid_arm64.s
+++ b/pkg/procid/procid_arm64.s
@@ -14,7 +14,7 @@
 
 // +build arm64
 // +build go1.8
-// +build !go1.16
+// +build !go1.17
 
 #include "textflag.h"
 
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index bf357de1a..979be5d89 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index 9f86f6a7a..607c82156 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index 0bee995e4..7ee20d89a 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 777d631cb..da2a2e9c4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index 118805492..19bce2afb 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go
index 1d7780695..f5e630009 100644
--- a/pkg/sync/memmove_unsafe.go
+++ b/pkg/sync/memmove_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go
index dc034d561..f4c2e9642 100644
--- a/pkg/sync/mutex_unsafe.go
+++ b/pkg/sync/mutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.16
+// +build !go1.17
 
 // When updating the build constraint (above), check that syncMutex matches the
 // standard library sync.Mutex definition.
diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go
index 995c0346e..b3b4dee78 100644
--- a/pkg/sync/rwmutex_unsafe.go
+++ b/pkg/sync/rwmutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go
index ad271e1a0..518f18479 100644
--- a/pkg/syncevent/waiter_unsafe.go
+++ b/pkg/syncevent/waiter_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
index 99313ee25..5db4bf12b 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
@@ -14,7 +14,7 @@
 
 // +build linux,amd64 linux,arm64
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index f32d58091..606363567 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.9
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
-- 
cgit v1.2.3


From 13d63f13f3b28e35c182c674c904520d7bd577db Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 24 Aug 2020 13:50:56 -0700
Subject: Remove go profiling flag from dockerutil.

Go profiling was removed from runsc debug in a previous change.

PiperOrigin-RevId: 328203826
---
 pkg/test/dockerutil/dockerutil.go | 1 -
 pkg/test/dockerutil/profile.go    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
index 952871f95..7027df1a5 100644
--- a/pkg/test/dockerutil/dockerutil.go
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -60,7 +60,6 @@ var (
 	// enabled for each run.
 	pprofBlock = flag.Bool("pprof-block", false, "enables block profiling with runsc debug")
 	pprofCPU   = flag.Bool("pprof-cpu", false, "enables CPU profiling with runsc debug")
-	pprofGo    = flag.Bool("pprof-go", false, "enables goroutine profiling with runsc debug")
 	pprofHeap  = flag.Bool("pprof-heap", false, "enables heap profiling with runsc debug")
 	pprofMutex = flag.Bool("pprof-mutex", false, "enables mutex profiling with runsc debug")
 )
diff --git a/pkg/test/dockerutil/profile.go b/pkg/test/dockerutil/profile.go
index f0396ef24..55f9496cd 100644
--- a/pkg/test/dockerutil/profile.go
+++ b/pkg/test/dockerutil/profile.go
@@ -63,7 +63,7 @@ type Pprof struct {
 
 // MakePprofFromFlags makes a Pprof profile from flags.
 func MakePprofFromFlags(c *Container) *Pprof {
-	if !(*pprofBlock || *pprofCPU || *pprofGo || *pprofHeap || *pprofMutex) {
+	if !(*pprofBlock || *pprofCPU || *pprofHeap || *pprofMutex) {
 		return nil
 	}
 	return &Pprof{
-- 
cgit v1.2.3


From 1ea284305f0aea9452dc590023b271f66a46e0b5 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 24 Aug 2020 16:32:26 -0700
Subject: Add check for same source in merkle tree lib

If the data is in the same Reader as the merkle tree, we should verify
from the first layer in the tree, instead of from the beginning.

PiperOrigin-RevId: 328230988
---
 pkg/merkletree/merkletree.go      |  61 +++++++--
 pkg/merkletree/merkletree_test.go | 251 +++++++++++++++++++++++---------------
 2 files changed, 205 insertions(+), 107 deletions(-)

(limited to 'pkg')

diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
index 955c9c473..1a0477c6a 100644
--- a/pkg/merkletree/merkletree.go
+++ b/pkg/merkletree/merkletree.go
@@ -45,12 +45,25 @@ type Layout struct {
 
 // InitLayout initializes and returns a new Layout object describing the structure
 // of a tree. dataSize specifies the size of input data in bytes.
-func InitLayout(dataSize int64) Layout {
+func InitLayout(dataSize int64, dataAndTreeInSameFile bool) Layout {
 	layout := Layout{
 		blockSize: usermem.PageSize,
 		// TODO(b/156980949): Allow config other hash methods (SHA384/SHA512).
 		digestSize: sha256DigestSize,
 	}
+
+	// treeStart is the offset (in bytes) of the first level of the tree in
+	// the file. If data and tree are in different files, treeStart should
+	// be zero. If data is in the same file as the tree, treeStart points
+	// to the block after the last data block (which may be zero-padded).
+	var treeStart int64
+	if dataAndTreeInSameFile {
+		treeStart = dataSize
+		if dataSize%layout.blockSize != 0 {
+			treeStart += layout.blockSize - dataSize%layout.blockSize
+		}
+	}
+
 	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
 	level := 0
 	offset := int64(0)
@@ -60,14 +73,15 @@ func InitLayout(dataSize int64) Layout {
 	// contain the hashes of the data blocks, while level numLevels - 1 is
 	// the root.
 	for numBlocks > 1 {
-		layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
+		layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
 		// Round numBlocks up to fill up a block.
 		numBlocks += (layout.hashesPerBlock() - numBlocks%layout.hashesPerBlock()) % layout.hashesPerBlock()
 		offset += numBlocks / layout.hashesPerBlock()
 		numBlocks = numBlocks / layout.hashesPerBlock()
 		level++
 	}
-	layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
+	layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
+
 	return layout
 }
 
@@ -107,11 +121,44 @@ func (layout Layout) blockOffset(level int, index int64) int64 {
 // written to treeWriter. The treeReader should be able to read the tree after
 // it has been written. That is, treeWriter and treeReader should point to the
 // same underlying data but have separate cursors.
-func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter io.Writer) ([]byte, error) {
-	layout := InitLayout(dataSize)
+// Generate will modify the cursor for data, but always restores it to its
+// original position upon exit. The cursor for tree is modified and not
+// restored.
+func Generate(data io.ReadSeeker, dataSize int64, treeReader io.ReadSeeker, treeWriter io.WriteSeeker, dataAndTreeInSameFile bool) ([]byte, error) {
+	layout := InitLayout(dataSize, dataAndTreeInSameFile)
 
 	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
 
+	// If the data is in the same file as the tree, zero pad the last data
+	// block.
+	bytesInLastBlock := dataSize % layout.blockSize
+	if dataAndTreeInSameFile && bytesInLastBlock != 0 {
+		zeroBuf := make([]byte, layout.blockSize-bytesInLastBlock)
+		if _, err := treeWriter.Seek(0, io.SeekEnd); err != nil && err != io.EOF {
+			return nil, err
+		}
+		if _, err := treeWriter.Write(zeroBuf); err != nil {
+			return nil, err
+		}
+	}
+
+	// Store the current offset, so we can set it back once verification
+	// finishes.
+	origOffset, err := data.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return nil, err
+	}
+	defer data.Seek(origOffset, io.SeekStart)
+
+	// Read from the beginning of both data and treeReader.
+	if _, err := data.Seek(0, io.SeekStart); err != nil && err != io.EOF {
+		return nil, err
+	}
+
+	if _, err := treeReader.Seek(0, io.SeekStart); err != nil && err != io.EOF {
+		return nil, err
+	}
+
 	var root []byte
 	for level := 0; level < layout.numLevels(); level++ {
 		for i := int64(0); i < numBlocks; i++ {
@@ -172,11 +219,11 @@ func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter i
 // Verify will modify the cursor for data, but always restores it to its
 // original position upon exit. The cursor for tree is modified and not
 // restored.
-func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset int64, readSize int64, expectedRoot []byte) error {
+func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset int64, readSize int64, expectedRoot []byte, dataAndTreeInSameFile bool) error {
 	if readSize <= 0 {
 		return fmt.Errorf("Unexpected read size: %d", readSize)
 	}
-	layout := InitLayout(int64(dataSize))
+	layout := InitLayout(int64(dataSize), dataAndTreeInSameFile)
 
 	// Calculate the index of blocks that includes the target range in input
 	// data.
diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go
index 911f61df9..ad50ba5f6 100644
--- a/pkg/merkletree/merkletree_test.go
+++ b/pkg/merkletree/merkletree_test.go
@@ -27,80 +27,58 @@ import (
 
 func TestLayout(t *testing.T) {
 	testCases := []struct {
-		dataSize            int64
-		expectedLevelOffset []int64
+		dataSize              int64
+		dataAndTreeInSameFile bool
+		expectedLevelOffset   []int64
 	}{
 		{
-			dataSize:            100,
-			expectedLevelOffset: []int64{0},
+			dataSize:              100,
+			dataAndTreeInSameFile: false,
+			expectedLevelOffset:   []int64{0},
 		},
 		{
-			dataSize:            1000000,
-			expectedLevelOffset: []int64{0, 2 * usermem.PageSize, 3 * usermem.PageSize},
+			dataSize:              100,
+			dataAndTreeInSameFile: true,
+			expectedLevelOffset:   []int64{usermem.PageSize},
 		},
 		{
-			dataSize:            4096 * int64(usermem.PageSize),
-			expectedLevelOffset: []int64{0, 32 * usermem.PageSize, 33 * usermem.PageSize},
+			dataSize:              1000000,
+			dataAndTreeInSameFile: false,
+			expectedLevelOffset:   []int64{0, 2 * usermem.PageSize, 3 * usermem.PageSize},
 		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) {
-			p := InitLayout(tc.dataSize)
-			if p.blockSize != int64(usermem.PageSize) {
-				t.Errorf("got blockSize %d, want %d", p.blockSize, usermem.PageSize)
-			}
-			if p.digestSize != sha256DigestSize {
-				t.Errorf("got digestSize %d, want %d", p.digestSize, sha256DigestSize)
-			}
-			if p.numLevels() != len(tc.expectedLevelOffset) {
-				t.Errorf("got levels %d, want %d", p.numLevels(), len(tc.expectedLevelOffset))
-			}
-			for i := 0; i < p.numLevels() && i < len(tc.expectedLevelOffset); i++ {
-				if p.levelOffset[i] != tc.expectedLevelOffset[i] {
-					t.Errorf("got levelStart[%d] %d, want %d", i, p.levelOffset[i], tc.expectedLevelOffset[i])
-				}
-			}
-		})
-	}
-}
-
-func TestGenerate(t *testing.T) {
-	// The input data has size dataSize. It starts with the data in startWith,
-	// and all other bytes are zeroes.
-	testCases := []struct {
-		data         []byte
-		expectedRoot []byte
-	}{
 		{
-			data:         bytes.Repeat([]byte{0}, usermem.PageSize),
-			expectedRoot: []byte{173, 127, 172, 178, 88, 111, 198, 233, 102, 192, 4, 215, 209, 209, 107, 2, 79, 88, 5, 255, 124, 180, 124, 122, 133, 218, 189, 139, 72, 137, 44, 167},
-		},
-		{
-			data:         bytes.Repeat([]byte{0}, 128*usermem.PageSize+1),
-			expectedRoot: []byte{62, 93, 40, 92, 161, 241, 30, 223, 202, 99, 39, 2, 132, 113, 240, 139, 117, 99, 79, 243, 54, 18, 100, 184, 141, 121, 238, 46, 149, 202, 203, 132},
+			dataSize:              1000000,
+			dataAndTreeInSameFile: true,
+			expectedLevelOffset:   []int64{245 * usermem.PageSize, 247 * usermem.PageSize, 248 * usermem.PageSize},
 		},
 		{
-			data:         []byte{'a'},
-			expectedRoot: []byte{52, 75, 204, 142, 172, 129, 37, 14, 145, 137, 103, 203, 11, 162, 209, 205, 30, 169, 213, 72, 20, 28, 243, 24, 242, 2, 92, 43, 169, 59, 110, 210},
+			dataSize:              4096 * int64(usermem.PageSize),
+			dataAndTreeInSameFile: false,
+			expectedLevelOffset:   []int64{0, 32 * usermem.PageSize, 33 * usermem.PageSize},
 		},
 		{
-			data:         bytes.Repeat([]byte{'a'}, usermem.PageSize),
-			expectedRoot: []byte{201, 62, 238, 45, 13, 176, 47, 16, 172, 199, 70, 13, 149, 118, 225, 34, 220, 248, 205, 83, 196, 191, 141, 252, 174, 27, 62, 116, 235, 207, 255, 90},
+			dataSize:              4096 * int64(usermem.PageSize),
+			dataAndTreeInSameFile: true,
+			expectedLevelOffset:   []int64{4096 * usermem.PageSize, 4128 * usermem.PageSize, 4129 * usermem.PageSize},
 		},
 	}
 
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("%d:%v", len(tc.data), tc.data[0]), func(t *testing.T) {
-			var tree bytes.Buffer
-
-			root, err := Generate(bytes.NewBuffer(tc.data), int64(len(tc.data)), &tree, &tree)
-			if err != nil {
-				t.Fatalf("Generate failed: %v", err)
+		t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) {
+			l := InitLayout(tc.dataSize, tc.dataAndTreeInSameFile)
+			if l.blockSize != int64(usermem.PageSize) {
+				t.Errorf("got blockSize %d, want %d", l.blockSize, usermem.PageSize)
 			}
-
-			if !bytes.Equal(root, tc.expectedRoot) {
-				t.Errorf("Unexpected root")
+			if l.digestSize != sha256DigestSize {
+				t.Errorf("got digestSize %d, want %d", l.digestSize, sha256DigestSize)
+			}
+			if l.numLevels() != len(tc.expectedLevelOffset) {
+				t.Errorf("got levels %d, want %d", l.numLevels(), len(tc.expectedLevelOffset))
+			}
+			for i := 0; i < l.numLevels() && i < len(tc.expectedLevelOffset); i++ {
+				if l.levelOffset[i] != tc.expectedLevelOffset[i] {
+					t.Errorf("got levelStart[%d] %d, want %d", i, l.levelOffset[i], tc.expectedLevelOffset[i])
+				}
 			}
 		})
 	}
@@ -151,6 +129,57 @@ func (brw *bytesReadWriter) Seek(offset int64, whence int) (int64, error) {
 	return off, nil
 }
 
+func TestGenerate(t *testing.T) {
+	// The input data has size dataSize. It starts with the data in startWith,
+	// and all other bytes are zeroes.
+	testCases := []struct {
+		data         []byte
+		expectedRoot []byte
+	}{
+		{
+			data:         bytes.Repeat([]byte{0}, usermem.PageSize),
+			expectedRoot: []byte{173, 127, 172, 178, 88, 111, 198, 233, 102, 192, 4, 215, 209, 209, 107, 2, 79, 88, 5, 255, 124, 180, 124, 122, 133, 218, 189, 139, 72, 137, 44, 167},
+		},
+		{
+			data:         bytes.Repeat([]byte{0}, 128*usermem.PageSize+1),
+			expectedRoot: []byte{62, 93, 40, 92, 161, 241, 30, 223, 202, 99, 39, 2, 132, 113, 240, 139, 117, 99, 79, 243, 54, 18, 100, 184, 141, 121, 238, 46, 149, 202, 203, 132},
+		},
+		{
+			data:         []byte{'a'},
+			expectedRoot: []byte{52, 75, 204, 142, 172, 129, 37, 14, 145, 137, 103, 203, 11, 162, 209, 205, 30, 169, 213, 72, 20, 28, 243, 24, 242, 2, 92, 43, 169, 59, 110, 210},
+		},
+		{
+			data:         bytes.Repeat([]byte{'a'}, usermem.PageSize),
+			expectedRoot: []byte{201, 62, 238, 45, 13, 176, 47, 16, 172, 199, 70, 13, 149, 118, 225, 34, 220, 248, 205, 83, 196, 191, 141, 252, 174, 27, 62, 116, 235, 207, 255, 90},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("%d:%v", len(tc.data), tc.data[0]), func(t *testing.T) {
+			for _, dataAndTreeInSameFile := range []bool{false, true} {
+				var tree bytesReadWriter
+				var root []byte
+				var err error
+				if dataAndTreeInSameFile {
+					tree.Write(tc.data)
+					root, err = Generate(&tree, int64(len(tc.data)), &tree, &tree, dataAndTreeInSameFile)
+				} else {
+					root, err = Generate(&bytesReadWriter{
+						bytes: tc.data,
+					}, int64(len(tc.data)), &tree, &tree, dataAndTreeInSameFile)
+				}
+				if err != nil {
+					t.Fatalf("got err: %v, want nil", err)
+				}
+
+				if !bytes.Equal(root, tc.expectedRoot) {
+					t.Errorf("got root: %v, want %v", root, tc.expectedRoot)
+				}
+			}
+		})
+	}
+}
+
 func TestVerify(t *testing.T) {
 	// The input data has size dataSize. The portion to be verified ranges from
 	// verifyStart with verifySize. A bit is flipped in outOfRangeByteIndex to
@@ -284,26 +313,37 @@ func TestVerify(t *testing.T) {
 			data := make([]byte, tc.dataSize)
 			// Generate random bytes in data.
 			rand.Read(data)
-			var tree bytesReadWriter
-
-			root, err := Generate(bytes.NewBuffer(data), int64(tc.dataSize), &tree, &tree)
-			if err != nil {
-				t.Fatalf("Generate failed: %v", err)
-			}
 
-			// Flip a bit in data and checks Verify results.
-			var buf bytes.Buffer
-			data[tc.modifyByte] ^= 1
-			if tc.shouldSucceed {
-				if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err != nil && err != io.EOF {
-					t.Errorf("Verification failed when expected to succeed: %v", err)
+			for _, dataAndTreeInSameFile := range []bool{false, true} {
+				var tree bytesReadWriter
+				var root []byte
+				var err error
+				if dataAndTreeInSameFile {
+					tree.Write(data)
+					root, err = Generate(&tree, int64(len(data)), &tree, &tree, dataAndTreeInSameFile)
+				} else {
+					root, err = Generate(&bytesReadWriter{
+						bytes: data,
+					}, int64(tc.dataSize), &tree, &tree, false /* dataAndTreeInSameFile */)
 				}
-				if int64(buf.Len()) != tc.verifySize || !bytes.Equal(data[tc.verifyStart:tc.verifyStart+tc.verifySize], buf.Bytes()) {
-					t.Errorf("Incorrect output from Verify")
+				if err != nil {
+					t.Fatalf("Generate failed: %v", err)
 				}
-			} else {
-				if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err == nil {
-					t.Errorf("Verification succeeded when expected to fail")
+
+				// Flip a bit in data and checks Verify results.
+				var buf bytes.Buffer
+				data[tc.modifyByte] ^= 1
+				if tc.shouldSucceed {
+					if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root, dataAndTreeInSameFile); err != nil && err != io.EOF {
+						t.Errorf("Verification failed when expected to succeed: %v", err)
+					}
+					if int64(buf.Len()) != tc.verifySize || !bytes.Equal(data[tc.verifyStart:tc.verifyStart+tc.verifySize], buf.Bytes()) {
+						t.Errorf("Incorrect output from Verify")
+					}
+				} else {
+					if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root, dataAndTreeInSameFile); err == nil {
+						t.Errorf("Verification succeeded when expected to fail")
+					}
 				}
 			}
 		})
@@ -318,36 +358,47 @@ func TestVerifyRandom(t *testing.T) {
 	data := make([]byte, dataSize)
 	// Generate random bytes in data.
 	rand.Read(data)
-	var tree bytesReadWriter
 
-	root, err := Generate(bytes.NewBuffer(data), int64(dataSize), &tree, &tree)
-	if err != nil {
-		t.Fatalf("Generate failed: %v", err)
-	}
+	for _, dataAndTreeInSameFile := range []bool{false, true} {
+		var tree bytesReadWriter
+		var root []byte
+		var err error
+		if dataAndTreeInSameFile {
+			tree.Write(data)
+			root, err = Generate(&tree, int64(len(data)), &tree, &tree, dataAndTreeInSameFile)
+		} else {
+			root, err = Generate(&bytesReadWriter{
+				bytes: data,
+			}, int64(dataSize), &tree, &tree, dataAndTreeInSameFile)
+		}
+		if err != nil {
+			t.Fatalf("Generate failed: %v", err)
+		}
 
-	// Pick a random portion of data.
-	start := rand.Int63n(dataSize - 1)
-	size := rand.Int63n(dataSize) + 1
+		// Pick a random portion of data.
+		start := rand.Int63n(dataSize - 1)
+		size := rand.Int63n(dataSize) + 1
 
-	var buf bytes.Buffer
-	// Checks that the random portion of data from the original data is
-	// verified successfully.
-	if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err != nil && err != io.EOF {
-		t.Errorf("Verification failed for correct data: %v", err)
-	}
-	if size > dataSize-start {
-		size = dataSize - start
-	}
-	if int64(buf.Len()) != size || !bytes.Equal(data[start:start+size], buf.Bytes()) {
-		t.Errorf("Incorrect output from Verify")
-	}
+		var buf bytes.Buffer
+		// Checks that the random portion of data from the original data is
+		// verified successfully.
+		if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root, dataAndTreeInSameFile); err != nil && err != io.EOF {
+			t.Errorf("Verification failed for correct data: %v", err)
+		}
+		if size > dataSize-start {
+			size = dataSize - start
+		}
+		if int64(buf.Len()) != size || !bytes.Equal(data[start:start+size], buf.Bytes()) {
+			t.Errorf("Incorrect output from Verify")
+		}
 
-	buf.Reset()
-	// Flip a random bit in randPortion, and check that verification fails.
-	randBytePos := rand.Int63n(size)
-	data[start+randBytePos] ^= 1
+		buf.Reset()
+		// Flip a random bit in randPortion, and check that verification fails.
+		randBytePos := rand.Int63n(size)
+		data[start+randBytePos] ^= 1
 
-	if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err == nil {
-		t.Errorf("Verification succeeded for modified data")
+		if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root, dataAndTreeInSameFile); err == nil {
+			t.Errorf("Verification succeeded for modified data")
+		}
 	}
 }
-- 
cgit v1.2.3


From 62af21c7f31fc3a4dca20df1d0cded197cf68ee8 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 24 Aug 2020 20:04:12 -0700
Subject: Flush in fsimpl/gofer.regularFileFD.OnClose() if there are no dirty
 pages.

This is closer to indistinguishable from VFS1 behavior.

PiperOrigin-RevId: 328256068
---
 pkg/sentry/fsimpl/gofer/regular_file.go | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 7e1cbf065..3b5462682 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -56,10 +56,16 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
 	if !fd.vfsfd.IsWritable() {
 		return nil
 	}
-	// Skip flushing if writes may be buffered by the client, since (as with
-	// the VFS1 client) we don't flush buffered writes on close anyway.
+	// Skip flushing if there are client-buffered writes, since (as with the
+	// VFS1 client) we don't flush buffered writes on close anyway.
 	d := fd.dentry()
-	if d.fs.opts.interop == InteropModeExclusive {
+	if d.fs.opts.interop != InteropModeExclusive {
+		return nil
+	}
+	d.dataMu.RLock()
+	haveDirtyPages := !d.dirty.IsEmpty()
+	d.dataMu.RUnlock()
+	if haveDirtyPages {
 		return nil
 	}
 	d.handleMu.RLock()
-- 
cgit v1.2.3


From 886d8f64d962a9c34145414c8c41e3d19f886ce1 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Mon, 24 Aug 2020 20:39:26 -0700
Subject: Automated rollback of changelist 327325153

PiperOrigin-RevId: 328259353
---
 pkg/tcpip/transport/tcp/BUILD       | 17 +--------------
 pkg/tcpip/transport/tcp/connect.go  | 11 ----------
 pkg/tcpip/transport/tcp/endpoint.go |  4 ++--
 pkg/tcpip/transport/tcp/segment.go  | 23 ++++++---------------
 pkg/tcpip/transport/tcp/snd.go      | 41 ++++++++++++++-----------------------
 5 files changed, 24 insertions(+), 72 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index bde071f2a..234fb95ce 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -11,8 +11,7 @@ go_template_instance(
     template = "//pkg/ilist:generic_list",
     types = {
         "Element": "*segment",
-        "ElementMapper": "segmentMapper",
-        "Linker": "*segmentEntry",
+        "Linker": "*segment",
     },
 )
 
@@ -28,19 +27,6 @@ go_template_instance(
     },
 )
 
-go_template_instance(
-    name = "tcp_rack_segment_list",
-    out = "tcp_rack_segment_list.go",
-    package = "tcp",
-    prefix = "rackSegment",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*segment",
-        "ElementMapper": "rackSegmentMapper",
-        "Linker": "*rackSegmentEntry",
-    },
-)
-
 go_library(
     name = "tcp",
     srcs = [
@@ -69,7 +55,6 @@ go_library(
         "snd.go",
         "snd_state.go",
         "tcp_endpoint_list.go",
-        "tcp_rack_segment_list.go",
         "tcp_segment_list.go",
         "timer.go",
     ],
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 87980c0a1..290172ac9 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -924,18 +924,7 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 
 	first := e.sndQueue.Front()
 	if first != nil {
-		lastSeg := e.snd.writeList.Back()
 		e.snd.writeList.PushBackList(&e.sndQueue)
-		if lastSeg == nil {
-			lastSeg = e.snd.writeList.Front()
-		} else {
-			lastSeg = lastSeg.segEntry.Next()
-		}
-		// Add new segments to rcList, as rcList and writeList should
-		// be consistent.
-		for seg := lastSeg; seg != nil; seg = seg.segEntry.Next() {
-			e.snd.rcList.PushBack(seg)
-		}
 		e.sndBufInQueue = 0
 	}
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9df22ac84..4ba0ea1c0 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1428,7 +1428,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	vec = append([][]byte(nil), vec...)
 
 	var num int64
-	for s := e.rcvList.Front(); s != nil; s = s.segEntry.Next() {
+	for s := e.rcvList.Front(); s != nil; s = s.Next() {
 		views := s.data.Views()
 
 		for i := s.viewToDeliver; i < len(views); i++ {
@@ -2249,7 +2249,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	if !handshake {
 		e.segmentQueue.mu.Lock()
 		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
-			for s := l.Front(); s != nil; s = s.segEntry.Next() {
+			for s := l.Front(); s != nil; s = s.Next() {
 				s.id = e.ID
 				s.route = r.Clone()
 				e.sndWaker.Assert()
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index a20755f78..94307d31a 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -30,13 +30,12 @@ import (
 //
 // +stateify savable
 type segment struct {
-	segEntry     segmentEntry
-	rackSegEntry rackSegmentEntry
-	refCnt       int32
-	id           stack.TransportEndpointID `state:"manual"`
-	route        stack.Route               `state:"manual"`
-	data         buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
-	hdr          header.TCP
+	segmentEntry
+	refCnt int32
+	id     stack.TransportEndpointID `state:"manual"`
+	route  stack.Route               `state:"manual"`
+	data   buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
+	hdr    header.TCP
 	// views is used as buffer for data when its length is large
 	// enough to store a VectorisedView.
 	views [8]buffer.View `state:"nosave"`
@@ -62,16 +61,6 @@ type segment struct {
 	xmitCount uint32
 }
 
-// segmentMapper is the ElementMapper for the writeList.
-type segmentMapper struct{}
-
-func (segmentMapper) linkerFor(seg *segment) *segmentEntry { return &seg.segEntry }
-
-// rackSegmentMapper is the ElementMapper for the rcList.
-type rackSegmentMapper struct{}
-
-func (rackSegmentMapper) linkerFor(seg *segment) *rackSegmentEntry { return &seg.rackSegEntry }
-
 func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 31151f23d..c55589c45 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -154,7 +154,6 @@ type sender struct {
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
-	rcList      rackSegmentList
 	resendTimer timer       `state:"nosave"`
 	resendWaker sleep.Waker `state:"nosave"`
 
@@ -368,7 +367,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
 
 	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
 	// if it is already before such a packet.
-	for seg := s.writeList.Front(); seg != nil; seg = seg.segEntry.Next() {
+	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
 		if seg == s.writeNext {
 			// We got to writeNext before we could find a segment
 			// exceeding the MTU.
@@ -623,7 +622,6 @@ func (s *sender) splitSeg(seg *segment, size int) {
 	nSeg.data.TrimFront(size)
 	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
 	s.writeList.InsertAfter(seg, nSeg)
-	s.rcList.InsertAfter(seg, nSeg)
 
 	// The segment being split does not carry PUSH flag because it is
 	// followed by the newly split segment.
@@ -655,7 +653,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 	var s3 *segment
 	var s4 *segment
 	// Step 1.
-	for seg := nextSegHint; seg != nil; seg = seg.segEntry.Next() {
+	for seg := nextSegHint; seg != nil; seg = seg.Next() {
 		// Stop iteration if we hit a segment that has never been
 		// transmitted (i.e. either it has no assigned sequence number
 		// or if it does have one, it's >= the next sequence number
@@ -685,7 +683,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 				// NextSeg():
 				//     (1.c) IsLost(S2) returns true.
 				if s.ep.scoreboard.IsLost(segSeq) {
-					return seg, seg.segEntry.Next(), false
+					return seg, seg.Next(), false
 				}
 
 				// NextSeg():
@@ -699,7 +697,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 				// SHOULD be returned.
 				if s3 == nil {
 					s3 = seg
-					hint = seg.segEntry.Next()
+					hint = seg.Next()
 				}
 			}
 			// NextSeg():
@@ -733,7 +731,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 	// range of one segment of up to SMSS octets of
 	// previously unsent data starting with sequence number
 	// HighData+1 MUST be returned."
-	for seg := s.writeNext; seg != nil; seg = seg.segEntry.Next() {
+	for seg := s.writeNext; seg != nil; seg = seg.Next() {
 		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
 			continue
 		}
@@ -775,16 +773,15 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			// triggering bugs in poorly written DNS
 			// implementations.
 			var nextTooBig bool
-			for seg.segEntry.Next() != nil && seg.segEntry.Next().data.Size() != 0 {
-				if seg.data.Size()+seg.segEntry.Next().data.Size() > available {
+			for seg.Next() != nil && seg.Next().data.Size() != 0 {
+				if seg.data.Size()+seg.Next().data.Size() > available {
 					nextTooBig = true
 					break
 				}
-				seg.data.Append(seg.segEntry.Next().data)
+				seg.data.Append(seg.Next().data)
 
 				// Consume the segment that we just merged in.
-				s.writeList.Remove(seg.segEntry.Next())
-				s.rcList.Remove(seg.rackSegEntry.Next())
+				s.writeList.Remove(seg.Next())
 			}
 			if !nextTooBig && seg.data.Size() < available {
 				// Segment is not full.
@@ -951,7 +948,7 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 			}
 			dataSent = true
 			s.outstanding++
-			s.writeNext = nextSeg.segEntry.Next()
+			s.writeNext = nextSeg.Next()
 			continue
 		}
 
@@ -964,7 +961,6 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 		// transmitted in (C.1)."
 		s.outstanding++
 		dataSent = true
-
 		s.sendSegment(nextSeg)
 
 		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
@@ -1039,7 +1035,7 @@ func (s *sender) sendData() {
 	if s.fr.active && s.ep.sackPermitted {
 		dataSent = s.handleSACKRecovery(s.maxPayloadSize, end)
 	} else {
-		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.segEntry.Next() {
+		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
 			cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
 			if cwndLimit < limit {
 				limit = cwndLimit
@@ -1047,7 +1043,7 @@ func (s *sender) sendData() {
 			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
 				// Move writeNext along so that we don't try and scan data that
 				// has already been SACKED.
-				s.writeNext = seg.segEntry.Next()
+				s.writeNext = seg.Next()
 				continue
 			}
 			if sent := s.maybeSendSegment(seg, limit, end); !sent {
@@ -1055,7 +1051,7 @@ func (s *sender) sendData() {
 			}
 			dataSent = true
 			s.outstanding += s.pCount(seg)
-			s.writeNext = seg.segEntry.Next()
+			s.writeNext = seg.Next()
 		}
 	}
 
@@ -1186,7 +1182,7 @@ func (s *sender) SetPipe() {
 	}
 	pipe := 0
 	smss := seqnum.Size(s.ep.scoreboard.SMSS())
-	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.segEntry.Next() {
+	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
 		// With GSO each segment can be much larger than SMSS. So check the segment
 		// in SMSS sized ranges.
 		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size()))
@@ -1388,7 +1384,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			}
 
 			if s.writeNext == seg {
-				s.writeNext = seg.segEntry.Next()
+				s.writeNext = seg.Next()
 			}
 
 			// Update the RACK fields if SACK is enabled.
@@ -1397,7 +1393,6 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			}
 
 			s.writeList.Remove(seg)
-			s.rcList.Remove(seg)
 
 			// if SACK is enabled then Only reduce outstanding if
 			// the segment was not previously SACKED as these have
@@ -1465,12 +1460,6 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error {
 		if s.sndCwnd < s.sndSsthresh {
 			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
 		}
-
-		// Move the segment which has to be retransmitted to the end of the list, as
-		// RACK requires the segments in the order of their transmission times.
-		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2
-		// Step 5
-		s.rcList.PushBack(seg)
 	}
 	seg.xmitTime = time.Now()
 	seg.xmitCount++
-- 
cgit v1.2.3


From 2ddd883a9459ab0f5ef22b81b8efbd1733fda035 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 25 Aug 2020 00:24:16 -0700
Subject: Fix deadlock in gofer direct IO.

Fixes several java runtime tests:
java/nio/channels/FileChannel/directio/ReadDirect.java
java/nio/channels/FileChannel/directio/PreadDirect.java

Updates #3576.

PiperOrigin-RevId: 328281849
---
 pkg/sentry/fsimpl/gofer/regular_file.go | 31 ++++++++++++++++++++-----------
 pkg/sentry/fsimpl/gofer/time.go         | 14 ++++++++++++++
 2 files changed, 34 insertions(+), 11 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 3b5462682..a2e9342d5 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -123,6 +123,10 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		return 0, io.EOF
 	}
 
+	var (
+		n       int64
+		readErr error
+	)
 	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
 		// Lock d.metadataMu for the rest of the read to prevent d.size from
 		// changing.
@@ -133,20 +137,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
 			return 0, err
 		}
-	}
-
-	rw := getDentryReadWriter(ctx, d, offset)
-	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		rw := getDentryReadWriter(ctx, d, offset)
 		// Require the read to go to the remote file.
 		rw.direct = true
+		n, readErr = dst.CopyOutFrom(ctx, rw)
+		putDentryReadWriter(rw)
+		if d.fs.opts.interop != InteropModeShared {
+			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+			d.touchAtimeLocked(fd.vfsfd.Mount())
+		}
+	} else {
+		rw := getDentryReadWriter(ctx, d, offset)
+		n, readErr = dst.CopyOutFrom(ctx, rw)
+		putDentryReadWriter(rw)
+		if d.fs.opts.interop != InteropModeShared {
+			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+			d.touchAtime(fd.vfsfd.Mount())
+		}
 	}
-	n, err := dst.CopyOutFrom(ctx, rw)
-	putDentryReadWriter(rw)
-	if d.fs.opts.interop != InteropModeShared {
-		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
-		d.touchAtime(fd.vfsfd.Mount())
-	}
-	return n, err
+	return n, readErr
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 98733253d..7e825caae 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -52,6 +52,20 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	mnt.EndWrite()
 }
 
+// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true.
+func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime || mnt.ReadOnly() {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now := d.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&d.atime, now)
+	atomic.StoreUint32(&d.atimeDirty, 1)
+	mnt.EndWrite()
+}
+
 // Preconditions:
 // * d.cachedMetadataAuthoritative() == true.
 // * The caller has successfully called vfs.Mount.CheckBeginWrite().
-- 
cgit v1.2.3


From 086f085660b73e8ead7ca0bfef5835a6aaad8866 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 25 Aug 2020 07:15:50 -0700
Subject: Fix TCP_LINGER2 behavior to match linux.

We still deviate a bit from linux in how long we will actually wait in
FIN-WAIT-2. Linux seems to cap it with TIME_WAIT_LEN and it's not completely
obvious as to why it's done that way. For now I think we can ignore that and
fix it if it really is an issue.

PiperOrigin-RevId: 328324922
---
 pkg/sentry/socket/netstack/netstack.go       | 10 ++++++---
 pkg/tcpip/transport/tcp/endpoint.go          | 23 ++++++++++++++-------
 pkg/tcpip/transport/tcp/tcp_test.go          |  7 ++++---
 test/syscalls/linux/socket_ip_tcp_generic.cc | 31 ++++++++++++++++++++++------
 4 files changed, 52 insertions(+), 19 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 4d0e33696..921464f5d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1409,8 +1409,12 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal
 		if err := ep.GetSockOpt(&v); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		lingerTimeout := primitive.Int32(time.Duration(v) / time.Second)
+		var lingerTimeout primitive.Int32
+		if v >= 0 {
+			lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
+		} else {
+			lingerTimeout = -1
+		}
 		return &lingerTimeout, nil
 
 	case linux.TCP_DEFER_ACCEPT:
@@ -1967,7 +1971,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 			return syserr.ErrInvalidArgument
 		}
 
-		v := usermem.ByteOrder.Uint32(optVal)
+		v := int32(usermem.ByteOrder.Uint32(optVal))
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
 
 	case linux.TCP_DEFER_ACCEPT:
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4ba0ea1c0..9c0f4c9f4 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1775,15 +1775,24 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 
 	case tcpip.TCPLingerTimeoutOption:
 		e.LockUser()
-		if v < 0 {
+
+		switch {
+		case v < 0:
 			// Same as effectively disabling TCPLinger timeout.
-			v = 0
-		}
-		// Cap it to MaxTCPLingerTimeout.
-		stkTCPLingerTimeout := tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
-		if v > stkTCPLingerTimeout {
-			v = stkTCPLingerTimeout
+			v = -1
+		case v == 0:
+			// Same as the stack default.
+			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
+			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
+				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
+			}
+			v = stackLingerTimeout
+		case v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
+			// Cap it to Stack's default TCP_LINGER2 timeout.
+			v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
+		default:
 		}
+
 		e.tcpLingerTimeout = time.Duration(v)
 		e.UnlockUser()
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 55ae09a2f..9650bb06c 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6206,12 +6206,13 @@ func TestTCPLingerTimeout(t *testing.T) {
 		tcpLingerTimeout time.Duration
 		want             time.Duration
 	}{
-		{"NegativeLingerTimeout", -123123, 0},
-		{"ZeroLingerTimeout", 0, 0},
+		{"NegativeLingerTimeout", -123123, -1},
+		// Zero is treated same as the stack's default TCP_LINGER2 timeout.
+		{"ZeroLingerTimeout", 0, tcp.DefaultTCPLingerTimeout},
 		{"InRangeLingerTimeout", 10 * time.Second, 10 * time.Second},
 		// Values > stack's TCPLingerTimeout are capped to the stack's
 		// value. Defaults to tcp.DefaultTCPLingerTimeout(60 seconds)
-		{"AboveMaxLingerTimeout", 125 * time.Second, 120 * time.Second},
+		{"AboveMaxLingerTimeout", tcp.MaxTCPLingerTimeout + 5*time.Second, tcp.MaxTCPLingerTimeout},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 53c076787..04356b780 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -819,18 +819,37 @@ TEST_P(TCPSocketPairTest, TCPLingerTimeoutDefault) {
   EXPECT_EQ(get, kDefaultTCPLingerTimeout);
 }
 
-TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZeroOrLess) {
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutLessThanZero) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  constexpr int kZero = 0;
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kZero,
-                         sizeof(kZero)),
-              SyscallSucceedsWithValue(0));
-
   constexpr int kNegative = -1234;
   EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
                          &kNegative, sizeof(kNegative)),
               SyscallSucceedsWithValue(0));
+  int get = INT_MAX;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, -1);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kZero,
+                         sizeof(kZero)),
+              SyscallSucceedsWithValue(0));
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_THAT(get,
+              AnyOf(Eq(kMaxTCPLingerTimeout), Eq(kOldMaxTCPLingerTimeout)));
 }
 
 TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveMax) {
-- 
cgit v1.2.3


From 98e652f6f1d8f3d0bbc4600b1ef2ce471d8e6406 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Tue, 25 Aug 2020 09:59:42 -0700
Subject: Support SO_LINGER socket option.

When SO_LINGER option is enabled, the close will not return until all the
queued messages are sent and acknowledged for the socket or linger timeout is
reached. If the option is not set, close will return immediately. This option
is mainly supported for connection oriented protocols such as TCP.

PiperOrigin-RevId: 328350576
---
 pkg/sentry/socket/netstack/netstack.go       |  45 ++++-
 pkg/sentry/socket/unix/transport/unix.go     |   2 +-
 pkg/tcpip/tcpip.go                           |   9 +
 pkg/tcpip/transport/tcp/endpoint.go          |  33 ++++
 test/packetimpact/dut/posix_server.cc        |   7 +
 test/packetimpact/proto/posix_server.proto   |  11 ++
 test/packetimpact/testbench/dut.go           |  42 +++++
 test/packetimpact/tests/BUILD                |  10 ++
 test/packetimpact/tests/tcp_linger_test.go   | 253 +++++++++++++++++++++++++++
 test/syscalls/linux/socket_ip_tcp_generic.cc | 119 +++++++++++++
 test/syscalls/linux/socket_ip_udp_generic.cc |  30 ++++
 11 files changed, 557 insertions(+), 4 deletions(-)
 create mode 100644 test/packetimpact/tests/tcp_linger_test.go

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 921464f5d..626195be2 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -479,8 +479,35 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
 }
 
 // Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release(context.Context) {
+func (s *socketOpsCommon) Release(ctx context.Context) {
+	e, ch := waiter.NewChannelEntry(nil)
+	s.EventRegister(&e, waiter.EventHUp|waiter.EventErr)
+	defer s.EventUnregister(&e)
+
 	s.Endpoint.Close()
+
+	// SO_LINGER option is valid only for TCP. For other socket types
+	// return after endpoint close.
+	if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) {
+		return
+	}
+
+	var v tcpip.LingerOption
+	if err := s.Endpoint.GetSockOpt(&v); err != nil {
+		return
+	}
+
+	// The case for zero timeout is handled in tcp endpoint close function.
+	// Close is blocked until either:
+	// 1. The endpoint state is not in any of the states: FIN-WAIT1,
+	// CLOSING and LAST_ACK.
+	// 2. Timeout is reached.
+	if v.Enabled && v.Timeout != 0 {
+		t := kernel.TaskFromContext(ctx)
+		start := t.Kernel().MonotonicClock().Now()
+		deadline := start.Add(v.Timeout)
+		t.BlockWithDeadline(ch, true, deadline)
+	}
 }
 
 // Read implements fs.FileOperations.Read.
@@ -1195,7 +1222,16 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		linger := linux.Linger{}
+		var v tcpip.LingerOption
+		var linger linux.Linger
+		if err := ep.GetSockOpt(&v); err != nil {
+			return &linger, nil
+		}
+
+		if v.Enabled {
+			linger.OnOff = 1
+		}
+		linger.Linger = int32(v.Timeout.Seconds())
 		return &linger, nil
 
 	case linux.SO_SNDTIMEO:
@@ -1865,7 +1901,10 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
 
-		return nil
+		return syserr.TranslateNetstackError(
+			ep.SetSockOpt(tcpip.LingerOption{
+				Enabled: v.OnOff != 0,
+				Timeout: time.Second * time.Duration(v.Linger)}))
 
 	case linux.SO_DETACH_FILTER:
 		// optval is ignored.
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 475d7177e..ab7bab5cd 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -942,7 +942,7 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	switch opt.(type) {
-	case tcpip.ErrorOption:
+	case tcpip.ErrorOption, *tcpip.LingerOption:
 		return nil
 
 	default:
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 07c85ce59..290c4e138 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -978,6 +978,15 @@ const (
 	TCPTimeWaitReuseLoopbackOnly
 )
 
+// LingerOption is used by SetSockOpt/GetSockOpt to set/get the
+// duration for which a socket lingers before returning from Close.
+//
+// +stateify savable
+type LingerOption struct {
+	Enabled bool
+	Timeout time.Duration
+}
+
 // IPPacketInfo is the message structure for IP_PKTINFO.
 //
 // +stateify savable
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9c0f4c9f4..ff9b8804d 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -654,6 +654,9 @@ type endpoint struct {
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
+
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -1007,6 +1010,26 @@ func (e *endpoint) Close() {
 		return
 	}
 
+	if e.linger.Enabled && e.linger.Timeout == 0 {
+		s := e.EndpointState()
+		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
+		if isResetState {
+			// Close the endpoint without doing full shutdown and
+			// send a RST.
+			e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+			e.closeNoShutdownLocked()
+
+			// Wake up worker to close the endpoint.
+			switch s {
+			case StateSynRecv:
+				e.notifyProtocolGoroutine(notifyClose)
+			default:
+				e.notifyProtocolGoroutine(notifyTickleWorker)
+			}
+			return
+		}
+	}
+
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
@@ -1807,6 +1830,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	case tcpip.SocketDetachFilterOption:
 		return nil
 
+	case tcpip.LingerOption:
+		e.LockUser()
+		e.linger = v
+		e.UnlockUser()
+
 	default:
 		return nil
 	}
@@ -2032,6 +2060,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 			Port: port,
 		}
 
+	case *tcpip.LingerOption:
+		e.LockUser()
+		*o = e.linger
+		e.UnlockUser()
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index 76ba701da..0f8e279f8 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -312,6 +312,13 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
+  ::grpc::Status Shutdown(grpc_impl::ServerContext *context,
+                          const ::posix_server::ShutdownRequest *request,
+                          ::posix_server::ShutdownResponse *response) override {
+    response->set_errno_(shutdown(request->fd(), request->how()));
+    return ::grpc::Status::OK;
+  }
+
   ::grpc::Status Recv(::grpc::ServerContext *context,
                       const ::posix_server::RecvRequest *request,
                       ::posix_server::RecvResponse *response) override {
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index ccd20b10d..f32ed54ef 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -188,6 +188,15 @@ message SocketResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
+message ShutdownRequest {
+  int32 fd = 1;
+  int32 how = 2;
+}
+
+message ShutdownResponse {
+  int32 errno_ = 1;  // "errno" may fail to compile in c++.
+}
+
 message RecvRequest {
   int32 sockfd = 1;
   int32 len = 2;
@@ -225,6 +234,8 @@ service Posix {
   rpc SetSockOpt(SetSockOptRequest) returns (SetSockOptResponse);
   // Call socket() on the DUT.
   rpc Socket(SocketRequest) returns (SocketResponse);
+  // Call shutdown() on the DUT.
+  rpc Shutdown(ShutdownRequest) returns (ShutdownResponse);
   // Call recv() on the DUT.
   rpc Recv(RecvRequest) returns (RecvResponse);
 }
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index 73c532e75..6165ab293 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -16,11 +16,13 @@ package testbench
 
 import (
 	"context"
+	"encoding/binary"
 	"flag"
 	"net"
 	"strconv"
 	"syscall"
 	"testing"
+	"time"
 
 	pb "gvisor.dev/gvisor/test/packetimpact/proto/posix_server_go_proto"
 
@@ -700,3 +702,43 @@ func (dut *DUT) RecvWithErrno(ctx context.Context, t *testing.T, sockfd, len, fl
 	}
 	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
 }
+
+// SetSockLingerOption sets SO_LINGER socket option on the DUT.
+func (dut *DUT) SetSockLingerOption(t *testing.T, sockfd int32, timeout time.Duration, enable bool) {
+	var linger unix.Linger
+	if enable {
+		linger.Onoff = 1
+	}
+	linger.Linger = int32(timeout / time.Second)
+
+	buf := make([]byte, 8)
+	binary.LittleEndian.PutUint32(buf, uint32(linger.Onoff))
+	binary.LittleEndian.PutUint32(buf[4:], uint32(linger.Linger))
+	dut.SetSockOpt(t, sockfd, unix.SOL_SOCKET, unix.SO_LINGER, buf)
+}
+
+// Shutdown calls shutdown on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// ShutdownWithErrno.
+func (dut *DUT) Shutdown(t *testing.T, fd, how int32) error {
+	t.Helper()
+
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	return dut.ShutdownWithErrno(ctx, t, fd, how)
+}
+
+// ShutdownWithErrno calls shutdown on the DUT.
+func (dut *DUT) ShutdownWithErrno(ctx context.Context, t *testing.T, fd, how int32) error {
+	t.Helper()
+
+	req := pb.ShutdownRequest{
+		Fd:  fd,
+		How: how,
+	}
+	resp, err := dut.posixServer.Shutdown(ctx, &req)
+	if err != nil {
+		t.Fatalf("failed to call Shutdown: %s", err)
+	}
+	return syscall.Errno(resp.GetErrno_())
+}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 74658fea0..7a7152fa5 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -308,3 +308,13 @@ packetimpact_go_test(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
+
+packetimpact_go_test(
+    name = "tcp_linger",
+    srcs = ["tcp_linger_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/test/packetimpact/tests/tcp_linger_test.go b/test/packetimpact/tests/tcp_linger_test.go
new file mode 100644
index 000000000..913e49e06
--- /dev/null
+++ b/test/packetimpact/tests/tcp_linger_test.go
@@ -0,0 +1,253 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_linger_test
+
+import (
+	"context"
+	"flag"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func createSocket(t *testing.T, dut testbench.DUT) (int32, int32, testbench.TCPIPv4) {
+	listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	conn.Connect(t)
+	acceptFD, _ := dut.Accept(t, listenFD)
+	return acceptFD, listenFD, conn
+}
+
+func closeAll(t *testing.T, dut testbench.DUT, listenFD int32, conn testbench.TCPIPv4) {
+	conn.Close(t)
+	dut.Close(t, listenFD)
+	dut.TearDown()
+}
+
+// lingerDuration is the timeout value used with SO_LINGER socket option.
+const lingerDuration = 3 * time.Second
+
+// TestTCPLingerZeroTimeout tests when SO_LINGER is set with zero timeout. DUT
+// should send RST-ACK when socket is closed.
+func TestTCPLingerZeroTimeout(t *testing.T) {
+	// Create a socket, listen, TCP connect, and accept.
+	dut := testbench.NewDUT(t)
+	acceptFD, listenFD, conn := createSocket(t, dut)
+	defer closeAll(t, dut, listenFD, conn)
+
+	dut.SetSockLingerOption(t, acceptFD, 0, true)
+	dut.Close(t, acceptFD)
+
+	// If the linger timeout is set to zero, the DUT should send a RST.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected RST-ACK packet within a second but got none: %s", err)
+	}
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
+
+// TestTCPLingerOff tests when SO_LINGER is not set. DUT should send FIN-ACK
+// when socket is closed.
+func TestTCPLingerOff(t *testing.T) {
+	// Create a socket, listen, TCP connect, and accept.
+	dut := testbench.NewDUT(t)
+	acceptFD, listenFD, conn := createSocket(t, dut)
+	defer closeAll(t, dut, listenFD, conn)
+
+	dut.Close(t, acceptFD)
+
+	// If SO_LINGER is not set, DUT should send a FIN-ACK.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+	}
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
+
+// TestTCPLingerNonZeroTimeout tests when SO_LINGER is set with non-zero timeout.
+// DUT should close the socket after timeout.
+func TestTCPLingerNonZeroTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		lingerOn    bool
+	}{
+		{"WithNonZeroLinger", true},
+		{"WithoutLinger", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			// Create a socket, listen, TCP connect, and accept.
+			dut := testbench.NewDUT(t)
+			acceptFD, listenFD, conn := createSocket(t, dut)
+			defer closeAll(t, dut, listenFD, conn)
+
+			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
+
+			// Increase timeout as Close will take longer time to
+			// return when SO_LINGER is set with non-zero timeout.
+			timeout := lingerDuration + 1*time.Second
+			ctx, cancel := context.WithTimeout(context.Background(), timeout)
+			defer cancel()
+			start := time.Now()
+			dut.CloseWithErrno(ctx, t, acceptFD)
+			end := time.Now()
+			diff := end.Sub(start)
+
+			if tt.lingerOn && diff < lingerDuration {
+				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
+			} else if !tt.lingerOn && diff > 1*time.Second {
+				t.Errorf("expected close to return within a second, but returned later")
+			}
+
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		})
+	}
+}
+
+// TestTCPLingerSendNonZeroTimeout tests when SO_LINGER is set with non-zero
+// timeout and send a packet. DUT should close the socket after timeout.
+func TestTCPLingerSendNonZeroTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		lingerOn    bool
+	}{
+		{"WithSendNonZeroLinger", true},
+		{"WithoutLinger", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			// Create a socket, listen, TCP connect, and accept.
+			dut := testbench.NewDUT(t)
+			acceptFD, listenFD, conn := createSocket(t, dut)
+			defer closeAll(t, dut, listenFD, conn)
+
+			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
+
+			// Send data.
+			sampleData := []byte("Sample Data")
+			dut.Send(t, acceptFD, sampleData, 0)
+
+			// Increase timeout as Close will take longer time to
+			// return when SO_LINGER is set with non-zero timeout.
+			timeout := lingerDuration + 1*time.Second
+			ctx, cancel := context.WithTimeout(context.Background(), timeout)
+			defer cancel()
+			start := time.Now()
+			dut.CloseWithErrno(ctx, t, acceptFD)
+			end := time.Now()
+			diff := end.Sub(start)
+
+			if tt.lingerOn && diff < lingerDuration {
+				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
+			} else if !tt.lingerOn && diff > 1*time.Second {
+				t.Errorf("expected close to return within a second, but returned later")
+			}
+
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+			if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
+				t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+			}
+
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		})
+	}
+}
+
+// TestTCPLingerShutdownZeroTimeout tests SO_LINGER with shutdown() and zero
+// timeout. DUT should send RST-ACK when socket is closed.
+func TestTCPLingerShutdownZeroTimeout(t *testing.T) {
+	// Create a socket, listen, TCP connect, and accept.
+	dut := testbench.NewDUT(t)
+	acceptFD, listenFD, conn := createSocket(t, dut)
+	defer closeAll(t, dut, listenFD, conn)
+
+	dut.SetSockLingerOption(t, acceptFD, 0, true)
+	dut.Shutdown(t, acceptFD, syscall.SHUT_RDWR)
+	dut.Close(t, acceptFD)
+
+	// Shutdown will send FIN-ACK with read/write option.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+	}
+
+	// If the linger timeout is set to zero, the DUT should send a RST.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected RST-ACK packet within a second but got none: %s", err)
+	}
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
+
+// TestTCPLingerShutdownSendNonZeroTimeout tests SO_LINGER with shutdown() and
+// non-zero timeout. DUT should close the socket after timeout.
+func TestTCPLingerShutdownSendNonZeroTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		lingerOn    bool
+	}{
+		{"shutdownRDWR", true},
+		{"shutdownRDWR", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			// Create a socket, listen, TCP connect, and accept.
+			dut := testbench.NewDUT(t)
+			acceptFD, listenFD, conn := createSocket(t, dut)
+			defer closeAll(t, dut, listenFD, conn)
+
+			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
+
+			// Send data.
+			sampleData := []byte("Sample Data")
+			dut.Send(t, acceptFD, sampleData, 0)
+
+			dut.Shutdown(t, acceptFD, syscall.SHUT_RDWR)
+
+			// Increase timeout as Close will take longer time to
+			// return when SO_LINGER is set with non-zero timeout.
+			timeout := lingerDuration + 1*time.Second
+			ctx, cancel := context.WithTimeout(context.Background(), timeout)
+			defer cancel()
+			start := time.Now()
+			dut.CloseWithErrno(ctx, t, acceptFD)
+			end := time.Now()
+			diff := end.Sub(start)
+
+			if tt.lingerOn && diff < lingerDuration {
+				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
+			} else if !tt.lingerOn && diff > 1*time.Second {
+				t.Errorf("expected close to return within a second, but returned later")
+			}
+
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+			if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
+				t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+			}
+
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		})
+	}
+}
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 04356b780..f4b69c46c 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -1080,5 +1080,124 @@ TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
   }
 }
 
+// Test setsockopt and getsockopt for a socket with SO_LINGER option.
+TEST_P(TCPSocketPairTest, SetAndGetLingerOption) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Check getsockopt before SO_LINGER option is set.
+  struct linger got_linger = {-1, -1};
+  socklen_t got_len = sizeof(got_linger);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_THAT(got_len, sizeof(got_linger));
+  struct linger want_linger = {};
+  EXPECT_EQ(0, memcmp(&want_linger, &got_linger, got_len));
+
+  // Set and get SO_LINGER with negative values.
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = -3;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(sl.l_onoff, got_linger.l_onoff);
+  // Linux returns a different value as it uses HZ to convert the seconds to
+  // jiffies which overflows for negative values. We want to be compatible with
+  // linux for getsockopt return value.
+  if (IsRunningOnGvisor()) {
+    EXPECT_EQ(sl.l_linger, got_linger.l_linger);
+  }
+
+  // Set and get SO_LINGER option with positive values.
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+}
+
+// Test socket to disable SO_LINGER option.
+TEST_P(TCPSocketPairTest, SetOffLingerOption) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Set the SO_LINGER option.
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+
+  // Check getsockopt after SO_LINGER option is set.
+  struct linger got_linger = {-1, -1};
+  socklen_t got_len = sizeof(got_linger);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+
+  sl.l_onoff = 0;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+
+  // Check getsockopt after SO_LINGER option is set to zero.
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+}
+
+// Test close on dup'd socket with SO_LINGER option set.
+TEST_P(TCPSocketPairTest, CloseWithLingerOption) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Set the SO_LINGER option.
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+
+  // Check getsockopt after SO_LINGER option is set.
+  struct linger got_linger = {-1, -1};
+  socklen_t got_len = sizeof(got_linger);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+
+  FileDescriptor dupFd = FileDescriptor(dup(sockets->first_fd()));
+  ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+  char buf[10] = {};
+  // Write on dupFd should succeed as socket will not be closed until
+  // all references are removed.
+  ASSERT_THAT(RetryEINTR(write)(dupFd.get(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+              SyscallFailsWithErrno(EBADF));
+
+  // Close the socket.
+  dupFd.reset();
+  // Write on dupFd should fail as all references for socket are removed.
+  ASSERT_THAT(RetryEINTR(write)(dupFd.get(), buf, sizeof(buf)),
+              SyscallFailsWithErrno(EBADF));
+}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index edb86aded..5cad6f017 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -448,5 +448,35 @@ TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
               SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
+// Test the SO_LINGER option can be set/get on udp socket.
+TEST_P(UDPSocketPairTest, SoLingerFail) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  int level = SOL_SOCKET;
+  int type = SO_LINGER;
+
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &sl, sizeof(sl)),
+              SyscallSucceedsWithValue(0));
+
+  struct linger got_linger = {};
+  socklen_t length = sizeof(sl);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), level, type, &got_linger, &length),
+      SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got_linger));
+  // Linux returns the values which are set in the SetSockOpt for SO_LINGER.
+  // In gVisor, we do not store the linger values for UDP as SO_LINGER for UDP
+  // is a no-op.
+  if (IsRunningOnGvisor()) {
+    struct linger want_linger = {};
+    EXPECT_EQ(0, memcmp(&want_linger, &got_linger, length));
+  } else {
+    EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 232587304de02d5d0634fe8b6118529cfd04bcad Mon Sep 17 00:00:00 2001
From: Sam Balana <sbalana@google.com>
Date: Tue, 25 Aug 2020 11:07:32 -0700
Subject: Add option to replace linkAddrCache with neighborCache

This change adds an option to replace the current implementation of ARP through
linkAddrCache, with an implementation of NUD through neighborCache. Switching
to using NUD for both ARP and NDP is beneficial for the reasons described by
RFC 4861 Section 3.1:

  "[Using NUD] significantly improves the robustness of packet delivery in the
  presence of failing routers, partially failing or partitioned links, or nodes
  that change their link-layer addresses. For instance, mobile nodes can move
  off-link without losing any connectivity due to stale ARP caches."

  "Unlike ARP, Neighbor Unreachability Detection detects half-link failures and
  avoids sending traffic to neighbors with which two-way connectivity is
  absent."

Along with these changes exposes the API for querying and operating the
neighbor cache. Operations include:
  - Create a static entry
  - List all entries
  - Delete all entries
  - Remove an entry by address

This also exposes the API to change the NUD protocol constants on a per-NIC
basis to allow Neighbor Discovery to operate over links with widely varying
performance characteristics. See [RFC 4861 Section 10][1] for the list of
constants.

Finally, an API for subscribing to NUD state changes is exposed through
NUDDispatcher. See [RFC 4861 Appendix C][3] for the list of edges.

Tests:
 pkg/tcpip/network/arp:arp_test
 + TestDirectRequest

 pkg/tcpip/network/ipv6:ipv6_test
 + TestLinkResolution
 + TestNDPValidation
 + TestNeighorAdvertisementWithTargetLinkLayerOption
 + TestNeighorSolicitationResponse
 + TestNeighorSolicitationWithSourceLinkLayerOption
 + TestRouterAdvertValidation

 pkg/tcpip/stack:stack_test
 + TestCacheWaker
 + TestForwardingWithFakeResolver
 + TestForwardingWithFakeResolverManyPackets
 + TestForwardingWithFakeResolverManyResolutions
 + TestForwardingWithFakeResolverPartialTimeout
 + TestForwardingWithFakeResolverTwoPackets
 + TestIPv6SourceAddressSelectionScopeAndSameAddress

[1]: https://tools.ietf.org/html/rfc4861#section-10
[2]: https://tools.ietf.org/html/rfc4861#appendix-C

Fixes #1889
Fixes #1894
Fixes #1895
Fixes #1947
Fixes #1948
Fixes #1949
Fixes #1950

PiperOrigin-RevId: 328365034
---
 pkg/tcpip/network/arp/BUILD           |   1 +
 pkg/tcpip/network/arp/arp.go          |  47 +-
 pkg/tcpip/network/arp/arp_test.go     | 331 +++++++++++--
 pkg/tcpip/network/ip_test.go          |  14 +-
 pkg/tcpip/network/ipv4/ipv4.go        |   2 +-
 pkg/tcpip/network/ipv6/icmp.go        | 278 +++++++----
 pkg/tcpip/network/ipv6/icmp_test.go   | 447 ++++++++++++-----
 pkg/tcpip/network/ipv6/ipv6.go        |   4 +-
 pkg/tcpip/network/ipv6/ndp_test.go    | 875 +++++++++++++++++++++++-----------
 pkg/tcpip/stack/forwarder_test.go     | 655 ++++++++++++++++---------
 pkg/tcpip/stack/linkaddrcache_test.go |  68 +++
 pkg/tcpip/stack/ndp_test.go           | 792 ++++++++++++++++--------------
 pkg/tcpip/stack/nic.go                |  94 +++-
 pkg/tcpip/stack/nic_test.go           |   2 +-
 pkg/tcpip/stack/nud_test.go           |  16 +-
 pkg/tcpip/stack/registration.go       |   4 +-
 pkg/tcpip/stack/route.go              |  19 +
 pkg/tcpip/stack/stack.go              | 100 +++-
 pkg/tcpip/stack/stack_test.go         |   2 +-
 pkg/tcpip/tcpip.go                    |   4 +
 pkg/tcpip/transport/udp/udp_test.go   |   4 +-
 21 files changed, 2611 insertions(+), 1148 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
index eddf7b725..82c073e32 100644
--- a/pkg/tcpip/network/arp/BUILD
+++ b/pkg/tcpip/network/arp/BUILD
@@ -28,5 +28,6 @@ go_test(
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/icmp",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 920872c3f..cbbe5b77f 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -46,6 +46,7 @@ type endpoint struct {
 	nicID         tcpip.NICID
 	linkEP        stack.LinkEndpoint
 	linkAddrCache stack.LinkAddressCache
+	nud           stack.NUDHandler
 }
 
 // DefaultTTL is unused for ARP. It implements stack.NetworkEndpoint.
@@ -78,7 +79,7 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderPara
 
 // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
 func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
-	return e.protocol.Number()
+	return ProtocolNumber
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
@@ -99,9 +100,25 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 	switch h.Op() {
 	case header.ARPRequest:
 		localAddr := tcpip.Address(h.ProtocolAddressTarget())
-		if e.linkAddrCache.CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, localAddr) == 0 {
-			return // we have no useful answer, ignore the request
+
+		if e.nud == nil {
+			if e.linkAddrCache.CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, localAddr) == 0 {
+				return // we have no useful answer, ignore the request
+			}
+
+			addr := tcpip.Address(h.ProtocolAddressSender())
+			linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
+			e.linkAddrCache.AddLinkAddress(e.nicID, addr, linkAddr)
+		} else {
+			if r.Stack().CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, localAddr) == 0 {
+				return // we have no useful answer, ignore the request
+			}
+
+			remoteAddr := tcpip.Address(h.ProtocolAddressSender())
+			remoteLinkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
+			e.nud.HandleProbe(remoteAddr, localAddr, ProtocolNumber, remoteLinkAddr, e.protocol)
 		}
+
 		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 			ReserveHeaderBytes: int(e.linkEP.MaxHeaderLength()) + header.ARPSize,
 		})
@@ -113,11 +130,28 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 		copy(packet.HardwareAddressTarget(), h.HardwareAddressSender())
 		copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender())
 		_ = e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
-		fallthrough // also fill the cache from requests
+
 	case header.ARPReply:
 		addr := tcpip.Address(h.ProtocolAddressSender())
 		linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
-		e.linkAddrCache.AddLinkAddress(e.nicID, addr, linkAddr)
+
+		if e.nud == nil {
+			e.linkAddrCache.AddLinkAddress(e.nicID, addr, linkAddr)
+			return
+		}
+
+		// The solicited, override, and isRouter flags are not available for ARP;
+		// they are only available for IPv6 Neighbor Advertisements.
+		e.nud.HandleConfirmation(addr, linkAddr, stack.ReachabilityConfirmationFlags{
+			// Solicited and unsolicited (also referred to as gratuitous) ARP Replies
+			// are handled equivalently to a solicited Neighbor Advertisement.
+			Solicited: true,
+			// If a different link address is received than the one cached, the entry
+			// should always go to Stale.
+			Override: false,
+			// ARP does not distinguish between router and non-router hosts.
+			IsRouter: false,
+		})
 	}
 }
 
@@ -134,12 +168,13 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 	return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress
 }
 
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, nud stack.NUDHandler, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
 	return &endpoint{
 		protocol:      p,
 		nicID:         nicID,
 		linkEP:        sender,
 		linkAddrCache: linkAddrCache,
+		nud:           nud,
 	}
 }
 
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index c2c3e6891..9c9a859e3 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -16,10 +16,12 @@ package arp_test
 
 import (
 	"context"
+	"fmt"
 	"strconv"
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -32,57 +34,192 @@ import (
 )
 
 const (
-	stackLinkAddr1 = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c")
-	stackLinkAddr2 = tcpip.LinkAddress("\x0b\x0b\x0c\x0c\x0d\x0d")
-	stackAddr1     = tcpip.Address("\x0a\x00\x00\x01")
-	stackAddr2     = tcpip.Address("\x0a\x00\x00\x02")
-	stackAddrBad   = tcpip.Address("\x0a\x00\x00\x03")
+	nicID = 1
+
+	stackAddr     = tcpip.Address("\x0a\x00\x00\x01")
+	stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c")
+
+	remoteAddr     = tcpip.Address("\x0a\x00\x00\x02")
+	remoteLinkAddr = tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06")
+
+	unknownAddr = tcpip.Address("\x0a\x00\x00\x03")
 
 	defaultChannelSize = 1
 	defaultMTU         = 65536
+
+	// eventChanSize defines the size of event channels used by the neighbor
+	// cache's event dispatcher. The size chosen here needs to be sufficient to
+	// queue all the events received during tests before consumption.
+	// If eventChanSize is too small, the tests may deadlock.
+	eventChanSize = 32
+)
+
+type eventType uint8
+
+const (
+	entryAdded eventType = iota
+	entryChanged
+	entryRemoved
 )
 
+func (t eventType) String() string {
+	switch t {
+	case entryAdded:
+		return "add"
+	case entryChanged:
+		return "change"
+	case entryRemoved:
+		return "remove"
+	default:
+		return fmt.Sprintf("unknown (%d)", t)
+	}
+}
+
+type eventInfo struct {
+	eventType eventType
+	nicID     tcpip.NICID
+	addr      tcpip.Address
+	linkAddr  tcpip.LinkAddress
+	state     stack.NeighborState
+}
+
+func (e eventInfo) String() string {
+	return fmt.Sprintf("%s event for NIC #%d, addr=%q, linkAddr=%q, state=%q", e.eventType, e.nicID, e.addr, e.linkAddr, e.state)
+}
+
+// arpDispatcher implements NUDDispatcher to validate the dispatching of
+// events upon certain NUD state machine events.
+type arpDispatcher struct {
+	// C is where events are queued
+	C chan eventInfo
+}
+
+var _ stack.NUDDispatcher = (*arpDispatcher)(nil)
+
+func (d *arpDispatcher) OnNeighborAdded(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state stack.NeighborState, updatedAt time.Time) {
+	e := eventInfo{
+		eventType: entryAdded,
+		nicID:     nicID,
+		addr:      addr,
+		linkAddr:  linkAddr,
+		state:     state,
+	}
+	d.C <- e
+}
+
+func (d *arpDispatcher) OnNeighborChanged(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state stack.NeighborState, updatedAt time.Time) {
+	e := eventInfo{
+		eventType: entryChanged,
+		nicID:     nicID,
+		addr:      addr,
+		linkAddr:  linkAddr,
+		state:     state,
+	}
+	d.C <- e
+}
+
+func (d *arpDispatcher) OnNeighborRemoved(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state stack.NeighborState, updatedAt time.Time) {
+	e := eventInfo{
+		eventType: entryRemoved,
+		nicID:     nicID,
+		addr:      addr,
+		linkAddr:  linkAddr,
+		state:     state,
+	}
+	d.C <- e
+}
+
+func (d *arpDispatcher) waitForEvent(ctx context.Context, want eventInfo) error {
+	select {
+	case got := <-d.C:
+		if diff := cmp.Diff(got, want, cmp.AllowUnexported(got)); diff != "" {
+			return fmt.Errorf("got invalid event (-got +want):\n%s", diff)
+		}
+	case <-ctx.Done():
+		return fmt.Errorf("%s for %s", ctx.Err(), want)
+	}
+	return nil
+}
+
+func (d *arpDispatcher) waitForEventWithTimeout(want eventInfo, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	return d.waitForEvent(ctx, want)
+}
+
+func (d *arpDispatcher) nextEvent() (eventInfo, bool) {
+	select {
+	case event := <-d.C:
+		return event, true
+	default:
+		return eventInfo{}, false
+	}
+}
+
 type testContext struct {
-	t      *testing.T
-	linkEP *channel.Endpoint
-	s      *stack.Stack
+	s       *stack.Stack
+	linkEP  *channel.Endpoint
+	nudDisp *arpDispatcher
 }
 
-func newTestContext(t *testing.T) *testContext {
+func newTestContext(t *testing.T, useNeighborCache bool) *testContext {
+	c := stack.DefaultNUDConfigurations()
+	// Transition from Reachable to Stale almost immediately to test if receiving
+	// probes refreshes positive reachability.
+	c.BaseReachableTime = time.Microsecond
+
+	d := arpDispatcher{
+		// Create an event channel large enough so the neighbor cache doesn't block
+		// while dispatching events. Blocking could interfere with the timing of
+		// NUD transitions.
+		C: make(chan eventInfo, eventChanSize),
+	}
+
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4()},
+		NUDConfigs:         c,
+		NUDDisp:            &d,
+		UseNeighborCache:   useNeighborCache,
 	})
 
-	ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1)
+	ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr)
+	ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+
 	wep := stack.LinkEndpoint(ep)
 
 	if testing.Verbose() {
 		wep = sniffer.New(ep)
 	}
-	if err := s.CreateNIC(1, wep); err != nil {
+	if err := s.CreateNIC(nicID, wep); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
 	}
 
-	if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil {
+	if err := s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil {
 		t.Fatalf("AddAddress for ipv4 failed: %v", err)
 	}
-	if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil {
-		t.Fatalf("AddAddress for ipv4 failed: %v", err)
+	if !useNeighborCache {
+		// The remote address needs to be assigned to the NIC so we can receive and
+		// verify outgoing ARP packets. The neighbor cache isn't concerned with
+		// this; the tests that use linkAddrCache expect the ARP responses to be
+		// received by the same NIC.
+		if err := s.AddAddress(nicID, ipv4.ProtocolNumber, remoteAddr); err != nil {
+			t.Fatalf("AddAddress for ipv4 failed: %v", err)
+		}
 	}
-	if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+	if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
 		t.Fatalf("AddAddress for arp failed: %v", err)
 	}
 
 	s.SetRouteTable([]tcpip.Route{{
 		Destination: header.IPv4EmptySubnet,
-		NIC:         1,
+		NIC:         nicID,
 	}})
 
 	return &testContext{
-		t:      t,
-		s:      s,
-		linkEP: ep,
+		s:       s,
+		linkEP:  ep,
+		nudDisp: &d,
 	}
 }
 
@@ -91,7 +228,7 @@ func (c *testContext) cleanup() {
 }
 
 func TestDirectRequest(t *testing.T) {
-	c := newTestContext(t)
+	c := newTestContext(t, false /* useNeighborCache */)
 	defer c.cleanup()
 
 	const senderMAC = "\x01\x02\x03\x04\x05\x06"
@@ -111,7 +248,7 @@ func TestDirectRequest(t *testing.T) {
 		}))
 	}
 
-	for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
+	for i, address := range []tcpip.Address{stackAddr, remoteAddr} {
 		t.Run(strconv.Itoa(i), func(t *testing.T) {
 			inject(address)
 			pi, _ := c.linkEP.ReadContext(context.Background())
@@ -122,7 +259,7 @@ func TestDirectRequest(t *testing.T) {
 			if !rep.IsValid() {
 				t.Fatalf("invalid ARP response: len = %d; response = %x", len(rep), rep)
 			}
-			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr1; got != want {
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want {
 				t.Errorf("got HardwareAddressSender = %s, want = %s", got, want)
 			}
 			if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want {
@@ -137,7 +274,7 @@ func TestDirectRequest(t *testing.T) {
 		})
 	}
 
-	inject(stackAddrBad)
+	inject(unknownAddr)
 	// Sleep tests are gross, but this will only potentially flake
 	// if there's a bug. If there is no bug this will reliably
 	// succeed.
@@ -148,6 +285,144 @@ func TestDirectRequest(t *testing.T) {
 	}
 }
 
+func TestDirectRequestWithNeighborCache(t *testing.T) {
+	c := newTestContext(t, true /* useNeighborCache */)
+	defer c.cleanup()
+
+	tests := []struct {
+		name           string
+		senderAddr     tcpip.Address
+		senderLinkAddr tcpip.LinkAddress
+		targetAddr     tcpip.Address
+		isValid        bool
+	}{
+		{
+			name:           "Loopback",
+			senderAddr:     stackAddr,
+			senderLinkAddr: stackLinkAddr,
+			targetAddr:     stackAddr,
+			isValid:        true,
+		},
+		{
+			name:           "Remote",
+			senderAddr:     remoteAddr,
+			senderLinkAddr: remoteLinkAddr,
+			targetAddr:     stackAddr,
+			isValid:        true,
+		},
+		{
+			name:           "RemoteInvalidTarget",
+			senderAddr:     remoteAddr,
+			senderLinkAddr: remoteLinkAddr,
+			targetAddr:     unknownAddr,
+			isValid:        false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Inject an incoming ARP request.
+			v := make(buffer.View, header.ARPSize)
+			h := header.ARP(v)
+			h.SetIPv4OverEthernet()
+			h.SetOp(header.ARPRequest)
+			copy(h.HardwareAddressSender(), test.senderLinkAddr)
+			copy(h.ProtocolAddressSender(), test.senderAddr)
+			copy(h.ProtocolAddressTarget(), test.targetAddr)
+			c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{
+				Data: v.ToVectorisedView(),
+			})
+
+			if !test.isValid {
+				// No packets should be sent after receiving an invalid ARP request.
+				// There is no need to perform a blocking read here, since packets are
+				// sent in the same function that handles ARP requests.
+				if pkt, ok := c.linkEP.Read(); ok {
+					t.Errorf("unexpected packet sent with network protocol number %d", pkt.Proto)
+				}
+				return
+			}
+
+			// Verify an ARP response was sent.
+			pi, ok := c.linkEP.Read()
+			if !ok {
+				t.Fatal("expected ARP response to be sent, got none")
+			}
+
+			if pi.Proto != arp.ProtocolNumber {
+				t.Fatalf("expected ARP response, got network protocol number %d", pi.Proto)
+			}
+			rep := header.ARP(pi.Pkt.NetworkHeader().View())
+			if !rep.IsValid() {
+				t.Fatalf("invalid ARP response: len = %d; response = %x", len(rep), rep)
+			}
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want {
+				t.Errorf("got HardwareAddressSender() = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want {
+				t.Errorf("got ProtocolAddressSender() = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressTarget()), tcpip.LinkAddress(h.HardwareAddressSender()); got != want {
+				t.Errorf("got HardwareAddressTarget() = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.Address(rep.ProtocolAddressTarget()), tcpip.Address(h.ProtocolAddressSender()); got != want {
+				t.Errorf("got ProtocolAddressTarget() = %s, want = %s", got, want)
+			}
+
+			// Verify the sender was saved in the neighbor cache.
+			wantEvent := eventInfo{
+				eventType: entryAdded,
+				nicID:     nicID,
+				addr:      test.senderAddr,
+				linkAddr:  tcpip.LinkAddress(test.senderLinkAddr),
+				state:     stack.Stale,
+			}
+			if err := c.nudDisp.waitForEventWithTimeout(wantEvent, time.Second); err != nil {
+				t.Fatal(err)
+			}
+
+			neighbors, err := c.s.Neighbors(nicID)
+			if err != nil {
+				t.Fatalf("c.s.Neighbors(%d): %s", nicID, err)
+			}
+
+			neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry)
+			for _, n := range neighbors {
+				if existing, ok := neighborByAddr[n.Addr]; ok {
+					if diff := cmp.Diff(existing, n); diff != "" {
+						t.Fatalf("duplicate neighbor entry found (-existing +got):\n%s", diff)
+					}
+					t.Fatalf("exact neighbor entry duplicate found for addr=%s", n.Addr)
+				}
+				neighborByAddr[n.Addr] = n
+			}
+
+			neigh, ok := neighborByAddr[test.senderAddr]
+			if !ok {
+				t.Fatalf("expected neighbor entry with Addr = %s", test.senderAddr)
+			}
+			if got, want := neigh.LinkAddr, test.senderLinkAddr; got != want {
+				t.Errorf("got neighbor LinkAddr = %s, want = %s", got, want)
+			}
+			if got, want := neigh.LocalAddr, stackAddr; got != want {
+				t.Errorf("got neighbor LocalAddr = %s, want = %s", got, want)
+			}
+			if got, want := neigh.State, stack.Stale; got != want {
+				t.Errorf("got neighbor State = %s, want = %s", got, want)
+			}
+
+			// No more events should be dispatched
+			for {
+				event, ok := c.nudDisp.nextEvent()
+				if !ok {
+					break
+				}
+				t.Errorf("unexpected %s", event)
+			}
+		})
+	}
+}
+
 func TestLinkAddressRequest(t *testing.T) {
 	tests := []struct {
 		name           string
@@ -156,8 +431,8 @@ func TestLinkAddressRequest(t *testing.T) {
 	}{
 		{
 			name:           "Unicast",
-			remoteLinkAddr: stackLinkAddr2,
-			expectLinkAddr: stackLinkAddr2,
+			remoteLinkAddr: remoteLinkAddr,
+			expectLinkAddr: remoteLinkAddr,
 		},
 		{
 			name:           "Multicast",
@@ -173,9 +448,9 @@ func TestLinkAddressRequest(t *testing.T) {
 			t.Fatal("expected ARP protocol to implement stack.LinkAddressResolver")
 		}
 
-		linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1)
-		if err := linkRes.LinkAddressRequest(stackAddr1, stackAddr2, test.remoteLinkAddr, linkEP); err != nil {
-			t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", stackAddr1, stackAddr2, test.remoteLinkAddr, err)
+		linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr)
+		if err := linkRes.LinkAddressRequest(stackAddr, remoteAddr, test.remoteLinkAddr, linkEP); err != nil {
+			t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", stackAddr, remoteAddr, test.remoteLinkAddr, err)
 		}
 
 		pkt, ok := linkEP.Read()
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 9007346fe..e45dd17f8 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -250,7 +250,7 @@ func buildDummyStack(t *testing.T) *stack.Stack {
 func TestIPv4Send(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, nil, &o, buildDummyStack(t))
+	ep := proto.NewEndpoint(nicID, nil, nil, nil, &o, buildDummyStack(t))
 	defer ep.Close()
 
 	// Allocate and initialize the payload view.
@@ -287,7 +287,7 @@ func TestIPv4Send(t *testing.T) {
 func TestIPv4Receive(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	ep := proto.NewEndpoint(nicID, nil, nil, &o, nil, buildDummyStack(t))
 	defer ep.Close()
 
 	totalLen := header.IPv4MinimumSize + 30
@@ -357,7 +357,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 		t.Run(c.name, func(t *testing.T) {
 			o := testObject{t: t}
 			proto := ipv4.NewProtocol()
-			ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+			ep := proto.NewEndpoint(nicID, nil, nil, &o, nil, buildDummyStack(t))
 			defer ep.Close()
 
 			const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize
@@ -418,7 +418,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 func TestIPv4FragmentationReceive(t *testing.T) {
 	o := testObject{t: t, v4: true}
 	proto := ipv4.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	ep := proto.NewEndpoint(nicID, nil, nil, &o, nil, buildDummyStack(t))
 	defer ep.Close()
 
 	totalLen := header.IPv4MinimumSize + 24
@@ -495,7 +495,7 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 func TestIPv6Send(t *testing.T) {
 	o := testObject{t: t}
 	proto := ipv6.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, channel.New(0, 1280, ""), buildDummyStack(t))
+	ep := proto.NewEndpoint(nicID, nil, nil, &o, channel.New(0, 1280, ""), buildDummyStack(t))
 	defer ep.Close()
 
 	// Allocate and initialize the payload view.
@@ -532,7 +532,7 @@ func TestIPv6Send(t *testing.T) {
 func TestIPv6Receive(t *testing.T) {
 	o := testObject{t: t}
 	proto := ipv6.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	ep := proto.NewEndpoint(nicID, nil, nil, &o, nil, buildDummyStack(t))
 	defer ep.Close()
 
 	totalLen := header.IPv6MinimumSize + 30
@@ -611,7 +611,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 		t.Run(c.name, func(t *testing.T) {
 			o := testObject{t: t}
 			proto := ipv6.NewProtocol()
-			ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+			ep := proto.NewEndpoint(nicID, nil, nil, &o, nil, buildDummyStack(t))
 			defer ep.Close()
 
 			dataOffset := header.IPv6MinimumSize*2 + header.ICMPv6MinimumSize
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 63ffb3660..55ca94268 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -59,7 +59,7 @@ type endpoint struct {
 }
 
 // NewEndpoint creates a new ipv4 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, _ stack.LinkAddressCache, _ stack.NUDHandler, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
 	return &endpoint{
 		nicID:      nicID,
 		linkEP:     linkEP,
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 66d3a953a..2b83c421e 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -15,8 +15,6 @@
 package ipv6
 
 import (
-	"fmt"
-
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -71,6 +69,59 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	e.dispatcher.DeliverTransportControlPacket(src, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
+// getLinkAddrOption searches NDP options for a given link address option using
+// the provided getAddr function as a filter. Returns the link address if
+// found; otherwise, returns the zero link address value. Also returns true if
+// the options are valid as per the wire format, false otherwise.
+func getLinkAddrOption(it header.NDPOptionIterator, getAddr func(header.NDPOption) tcpip.LinkAddress) (tcpip.LinkAddress, bool) {
+	var linkAddr tcpip.LinkAddress
+	for {
+		opt, done, err := it.Next()
+		if err != nil {
+			return "", false
+		}
+		if done {
+			break
+		}
+		if addr := getAddr(opt); len(addr) != 0 {
+			// No RFCs define what to do when an NDP message has multiple Link-Layer
+			// Address options. Since no interface can have multiple link-layer
+			// addresses, we consider such messages invalid.
+			if len(linkAddr) != 0 {
+				return "", false
+			}
+			linkAddr = addr
+		}
+	}
+	return linkAddr, true
+}
+
+// getSourceLinkAddr searches NDP options for the source link address option.
+// Returns the link address if found; otherwise, returns the zero link address
+// value. Also returns true if the options are valid as per the wire format,
+// false otherwise.
+func getSourceLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) {
+	return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress {
+		if src, ok := opt.(header.NDPSourceLinkLayerAddressOption); ok {
+			return src.EthernetAddress()
+		}
+		return ""
+	})
+}
+
+// getTargetLinkAddr searches NDP options for the target link address option.
+// Returns the link address if found; otherwise, returns the zero link address
+// value. Also returns true if the options are valid as per the wire format,
+// false otherwise.
+func getTargetLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) {
+	return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress {
+		if dst, ok := opt.(header.NDPTargetLinkLayerAddressOption); ok {
+			return dst.EthernetAddress()
+		}
+		return ""
+	})
+}
+
 func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
@@ -137,7 +188,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-		if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
+		if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -147,14 +198,15 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// NDP messages cannot be fragmented. Also note that in the common case NDP
 		// datagrams are very small and ToView() will not incur allocations.
 		ns := header.NDPNeighborSolicit(payload.ToView())
-		it, err := ns.Options().Iter(true)
-		if err != nil {
-			// If we have a malformed NDP NS option, drop the packet.
+		targetAddr := ns.TargetAddress()
+
+		// As per RFC 4861 section 4.3, the Target Address MUST NOT be a multicast
+		// address.
+		if header.IsV6MulticastAddress(targetAddr) {
 			received.Invalid.Increment()
 			return
 		}
 
-		targetAddr := ns.TargetAddress()
 		s := r.Stack()
 		if isTentative, err := s.IsAddrTentative(e.nicID, targetAddr); err != nil {
 			// We will only get an error if the NIC is unrecognized, which should not
@@ -187,39 +239,22 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// so the packet is processed as defined in RFC 4861, as per RFC 4862
 		// section 5.4.3.
 
-		// Is the NS targetting us?
-		if e.linkAddrCache.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 {
+		// Is the NS targeting us?
+		if s.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 {
 			return
 		}
 
-		// If the NS message contains the Source Link-Layer Address option, update
-		// the link address cache with the value of the option.
-		//
-		// TODO(b/148429853): Properly process the NS message and do Neighbor
-		// Unreachability Detection.
-		var sourceLinkAddr tcpip.LinkAddress
-		for {
-			opt, done, err := it.Next()
-			if err != nil {
-				// This should never happen as Iter(true) above did not return an error.
-				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
-			}
-			if done {
-				break
-			}
+		it, err := ns.Options().Iter(false /* check */)
+		if err != nil {
+			// Options are not valid as per the wire format, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
 
-			switch opt := opt.(type) {
-			case header.NDPSourceLinkLayerAddressOption:
-				// No RFCs define what to do when an NS message has multiple Source
-				// Link-Layer Address options. Since no interface can have multiple
-				// link-layer addresses, we consider such messages invalid.
-				if len(sourceLinkAddr) != 0 {
-					received.Invalid.Increment()
-					return
-				}
-
-				sourceLinkAddr = opt.EthernetAddress()
-			}
+		sourceLinkAddr, ok := getSourceLinkAddr(it)
+		if !ok {
+			received.Invalid.Increment()
+			return
 		}
 
 		unspecifiedSource := r.RemoteAddress == header.IPv6Any
@@ -237,6 +272,8 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		} else if unspecifiedSource {
 			received.Invalid.Increment()
 			return
+		} else if e.nud != nil {
+			e.nud.HandleProbe(r.RemoteAddress, r.LocalAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol)
 		} else {
 			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, sourceLinkAddr)
 		}
@@ -304,7 +341,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 
 	case header.ICMPv6NeighborAdvert:
 		received.NeighborAdvert.Increment()
-		if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
+		if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborAdvertSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -314,17 +351,10 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// 5, NDP messages cannot be fragmented. Also note that in the common case
 		// NDP datagrams are very small and ToView() will not incur allocations.
 		na := header.NDPNeighborAdvert(payload.ToView())
-		it, err := na.Options().Iter(true)
-		if err != nil {
-			// If we have a malformed NDP NA option, drop the packet.
-			received.Invalid.Increment()
-			return
-		}
-
 		targetAddr := na.TargetAddress()
-		stack := r.Stack()
+		s := r.Stack()
 
-		if isTentative, err := stack.IsAddrTentative(e.nicID, targetAddr); err != nil {
+		if isTentative, err := s.IsAddrTentative(e.nicID, targetAddr); err != nil {
 			// We will only get an error if the NIC is unrecognized, which should not
 			// happen. For now short-circuit this packet.
 			//
@@ -335,7 +365,14 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 			// DAD on, implying the address is not unique. In this case we let the
 			// stack know so it can handle such a scenario and do nothing furthur with
 			// the NDP NA.
-			stack.DupTentativeAddrDetected(e.nicID, targetAddr)
+			s.DupTentativeAddrDetected(e.nicID, targetAddr)
+			return
+		}
+
+		it, err := na.Options().Iter(false /* check */)
+		if err != nil {
+			// If we have a malformed NDP NA option, drop the packet.
+			received.Invalid.Increment()
 			return
 		}
 
@@ -348,39 +385,25 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// TODO(b/143147598): Handle the scenario described above. Also inform the
 		// netstack integration that a duplicate address was detected outside of
 		// DAD.
+		targetLinkAddr, ok := getTargetLinkAddr(it)
+		if !ok {
+			received.Invalid.Increment()
+			return
+		}
 
 		// If the NA message has the target link layer option, update the link
 		// address cache with the link address for the target of the message.
-		//
-		// TODO(b/148429853): Properly process the NA message and do Neighbor
-		// Unreachability Detection.
-		var targetLinkAddr tcpip.LinkAddress
-		for {
-			opt, done, err := it.Next()
-			if err != nil {
-				// This should never happen as Iter(true) above did not return an error.
-				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
-			}
-			if done {
-				break
+		if len(targetLinkAddr) != 0 {
+			if e.nud == nil {
+				e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, targetLinkAddr)
+				return
 			}
 
-			switch opt := opt.(type) {
-			case header.NDPTargetLinkLayerAddressOption:
-				// No RFCs define what to do when an NA message has multiple Target
-				// Link-Layer Address options. Since no interface can have multiple
-				// link-layer addresses, we consider such messages invalid.
-				if len(targetLinkAddr) != 0 {
-					received.Invalid.Increment()
-					return
-				}
-
-				targetLinkAddr = opt.EthernetAddress()
-			}
-		}
-
-		if len(targetLinkAddr) != 0 {
-			e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, targetLinkAddr)
+			e.nud.HandleConfirmation(targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{
+				Solicited: na.SolicitedFlag(),
+				Override:  na.OverrideFlag(),
+				IsRouter:  na.RouterFlag(),
+			})
 		}
 
 	case header.ICMPv6EchoRequest:
@@ -440,27 +463,75 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 
 	case header.ICMPv6RouterSolicit:
 		received.RouterSolicit.Increment()
-		if !isNDPValid() {
+
+		//
+		// Validate the RS as per RFC 4861 section 6.1.1.
+		//
+
+		// Is the NDP payload of sufficient size to hold a Router Solictation?
+		if !isNDPValid() || pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRSMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
 
-	case header.ICMPv6RouterAdvert:
-		received.RouterAdvert.Increment()
+		stack := r.Stack()
 
-		// Is the NDP payload of sufficient size to hold a Router
-		// Advertisement?
-		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() {
+		// Is the networking stack operating as a router?
+		if !stack.Forwarding() {
+			// ... No, silently drop the packet.
+			received.RouterOnlyPacketsDroppedByHost.Increment()
+			return
+		}
+
+		// Note that in the common case NDP datagrams are very small and ToView()
+		// will not incur allocations.
+		rs := header.NDPRouterSolicit(payload.ToView())
+		it, err := rs.Options().Iter(false /* check */)
+		if err != nil {
+			// Options are not valid as per the wire format, silently drop the packet.
 			received.Invalid.Increment()
 			return
 		}
 
-		routerAddr := iph.SourceAddress()
+		sourceLinkAddr, ok := getSourceLinkAddr(it)
+		if !ok {
+			received.Invalid.Increment()
+			return
+		}
+
+		// If the RS message has the source link layer option, update the link
+		// address cache with the link address for the source of the message.
+		if len(sourceLinkAddr) != 0 {
+			// As per RFC 4861 section 4.1, the Source Link-Layer Address Option MUST
+			// NOT be included when the source IP address is the unspecified address.
+			// Otherwise, it SHOULD be included on link layers that have addresses.
+			if r.RemoteAddress == header.IPv6Any {
+				received.Invalid.Increment()
+				return
+			}
+
+			if e.nud != nil {
+				// A RS with a specified source IP address modifies the NUD state
+				// machine in the same way a reachability probe would.
+				e.nud.HandleProbe(r.RemoteAddress, r.LocalAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol)
+			}
+		}
+
+	case header.ICMPv6RouterAdvert:
+		received.RouterAdvert.Increment()
 
 		//
 		// Validate the RA as per RFC 4861 section 6.1.2.
 		//
 
+		// Is the NDP payload of sufficient size to hold a Router Advertisement?
+		if !isNDPValid() || pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize {
+			received.Invalid.Increment()
+			return
+		}
+
+		routerAddr := iph.SourceAddress()
+
 		// Is the IP Source Address a link-local address?
 		if !header.IsV6LinkLocalAddress(routerAddr) {
 			// ...No, silently drop the packet.
@@ -468,16 +539,18 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 			return
 		}
 
-		// The remainder of payload must be only the router advertisement, so
-		// payload.ToView() always returns the advertisement. Per RFC 6980 section
-		// 5, NDP messages cannot be fragmented. Also note that in the common case
-		// NDP datagrams are very small and ToView() will not incur allocations.
+		// Note that in the common case NDP datagrams are very small and ToView()
+		// will not incur allocations.
 		ra := header.NDPRouterAdvert(payload.ToView())
-		opts := ra.Options()
+		it, err := ra.Options().Iter(false /* check */)
+		if err != nil {
+			// Options are not valid as per the wire format, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
 
-		// Are options valid as per the wire format?
-		if _, err := opts.Iter(true); err != nil {
-			// ...No, silently drop the packet.
+		sourceLinkAddr, ok := getSourceLinkAddr(it)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
@@ -487,12 +560,33 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// as RFC 4861 section 6.1.2 is concerned.
 		//
 
+		// If the RA has the source link layer option, update the link address
+		// cache with the link address for the advertised router.
+		if len(sourceLinkAddr) != 0 && e.nud != nil {
+			e.nud.HandleProbe(routerAddr, r.LocalAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol)
+		}
+
 		// Tell the NIC to handle the RA.
 		stack := r.Stack()
-		rxNICID := r.NICID()
-		stack.HandleNDPRA(rxNICID, routerAddr, ra)
+		stack.HandleNDPRA(e.nicID, routerAddr, ra)
 
 	case header.ICMPv6RedirectMsg:
+		// TODO(gvisor.dev/issue/2285): Call `e.nud.HandleProbe` after validating
+		// this redirect message, as per RFC 4871 section 7.3.3:
+		//
+		//    "A Neighbor Cache entry enters the STALE state when created as a
+		//    result of receiving packets other than solicited Neighbor
+		//    Advertisements (i.e., Router Solicitations, Router Advertisements,
+		//    Redirects, and Neighbor Solicitations).  These packets contain the
+		//    link-layer address of either the sender or, in the case of Redirect,
+		//    the redirection target.  However, receipt of these link-layer
+		//    addresses does not confirm reachability of the forward-direction path
+		//    to that node.  Placing a newly created Neighbor Cache entry for which
+		//    the link-layer address is known in the STALE state provides assurance
+		//    that path failures are detected quickly. In addition, should a cached
+		//    link-layer address be modified due to receiving one of the above
+		//    messages, the state SHOULD also be set to STALE to provide prompt
+		//    verification that the path to the new link-layer address is working."
 		received.RedirectMsg.Increment()
 		if !isNDPValid() {
 			received.Invalid.Increment()
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 9e4eeea77..8112ed051 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -31,6 +31,8 @@ import (
 )
 
 const (
+	nicID = 1
+
 	linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
 	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
 	linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f")
@@ -49,7 +51,10 @@ type stubLinkEndpoint struct {
 }
 
 func (*stubLinkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
-	return 0
+	// Indicate that resolution for link layer addresses is required to send
+	// packets over this link. This is needed so the NIC knows to allocate a
+	// neighbor table.
+	return stack.CapabilityResolutionRequired
 }
 
 func (*stubLinkEndpoint) MaxHeaderLength() uint16 {
@@ -84,16 +89,184 @@ func (*stubLinkAddressCache) CheckLocalAddress(tcpip.NICID, tcpip.NetworkProtoco
 func (*stubLinkAddressCache) AddLinkAddress(tcpip.NICID, tcpip.Address, tcpip.LinkAddress) {
 }
 
+type stubNUDHandler struct{}
+
+var _ stack.NUDHandler = (*stubNUDHandler)(nil)
+
+func (*stubNUDHandler) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes stack.LinkAddressResolver) {
+}
+
+func (*stubNUDHandler) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags stack.ReachabilityConfirmationFlags) {
+}
+
+func (*stubNUDHandler) HandleUpperLevelConfirmation(addr tcpip.Address) {
+}
+
 func TestICMPCounts(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+				UseNeighborCache:   test.useNeighborCache,
+			})
+			{
+				if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+					t.Fatalf("CreateNIC(_, _) = %s", err)
+				}
+				if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+					t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+				}
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         nicID,
+					}},
+				)
+			}
+
+			netProto := s.NetworkProtocolInstance(ProtocolNumber)
+			if netProto == nil {
+				t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+			}
+			ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubNUDHandler{}, &stubDispatcher{}, nil, s)
+			defer ep.Close()
+
+			r, err := s.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
+			}
+			defer r.Release()
+
+			var tllData [header.NDPLinkLayerAddressSize]byte
+			header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+				header.NDPTargetLinkLayerAddressOption(linkAddr1),
+			})
+
+			types := []struct {
+				typ       header.ICMPv6Type
+				size      int
+				extraData []byte
+			}{
+				{
+					typ:  header.ICMPv6DstUnreachable,
+					size: header.ICMPv6DstUnreachableMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6PacketTooBig,
+					size: header.ICMPv6PacketTooBigMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6TimeExceeded,
+					size: header.ICMPv6MinimumSize,
+				},
+				{
+					typ:  header.ICMPv6ParamProblem,
+					size: header.ICMPv6MinimumSize,
+				},
+				{
+					typ:  header.ICMPv6EchoRequest,
+					size: header.ICMPv6EchoMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6EchoReply,
+					size: header.ICMPv6EchoMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6RouterSolicit,
+					size: header.ICMPv6MinimumSize,
+				},
+				{
+					typ:  header.ICMPv6RouterAdvert,
+					size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6NeighborSolicit,
+					size: header.ICMPv6NeighborSolicitMinimumSize,
+				},
+				{
+					typ:       header.ICMPv6NeighborAdvert,
+					size:      header.ICMPv6NeighborAdvertMinimumSize,
+					extraData: tllData[:],
+				},
+				{
+					typ:  header.ICMPv6RedirectMsg,
+					size: header.ICMPv6MinimumSize,
+				},
+			}
+
+			handleIPv6Payload := func(icmp header.ICMPv6) {
+				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+					ReserveHeaderBytes: header.IPv6MinimumSize,
+					Data:               buffer.View(icmp).ToVectorisedView(),
+				})
+				ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(len(icmp)),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       r.LocalAddress,
+					DstAddr:       r.RemoteAddress,
+				})
+				ep.HandlePacket(&r, pkt)
+			}
+
+			for _, typ := range types {
+				icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+				copy(icmp[typ.size:], typ.extraData)
+				icmp.SetType(typ.typ)
+				icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+				handleIPv6Payload(icmp)
+			}
+
+			// Construct an empty ICMP packet so that
+			// Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented.
+			handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize)))
+
+			icmpv6Stats := s.Stats().ICMP.V6PacketsReceived
+			visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) {
+				if got, want := s.Value(), uint64(1); got != want {
+					t.Errorf("got %s = %d, want = %d", name, got, want)
+				}
+			})
+			if t.Failed() {
+				t.Logf("stats:\n%+v", s.Stats())
+			}
+		})
+	}
+}
+
+func TestICMPCountsWithNeighborCache(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+		UseNeighborCache:   true,
 	})
 	{
-		if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
-			t.Fatalf("CreateNIC(_) = %s", err)
+		if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+			t.Fatalf("CreateNIC(_, _) = %s", err)
 		}
-		if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+		if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
 			t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
 		}
 	}
@@ -105,7 +278,7 @@ func TestICMPCounts(t *testing.T) {
 		s.SetRouteTable(
 			[]tcpip.Route{{
 				Destination: subnet,
-				NIC:         1,
+				NIC:         nicID,
 			}},
 		)
 	}
@@ -114,12 +287,12 @@ func TestICMPCounts(t *testing.T) {
 	if netProto == nil {
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
-	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
+	ep := netProto.NewEndpoint(0, nil, &stubNUDHandler{}, &stubDispatcher{}, nil, s)
 	defer ep.Close()
 
-	r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+	r, err := s.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+		t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
 	}
 	defer r.Release()
 
@@ -265,19 +438,19 @@ func newTestContext(t *testing.T) *testContext {
 	if testing.Verbose() {
 		wrappedEP0 = sniffer.New(wrappedEP0)
 	}
-	if err := c.s0.CreateNIC(1, wrappedEP0); err != nil {
+	if err := c.s0.CreateNIC(nicID, wrappedEP0); err != nil {
 		t.Fatalf("CreateNIC s0: %v", err)
 	}
-	if err := c.s0.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+	if err := c.s0.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
 		t.Fatalf("AddAddress lladdr0: %v", err)
 	}
 
 	c.linkEP1 = channel.New(defaultChannelSize, defaultMTU, linkAddr1)
 	wrappedEP1 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP1})
-	if err := c.s1.CreateNIC(1, wrappedEP1); err != nil {
+	if err := c.s1.CreateNIC(nicID, wrappedEP1); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
 	}
-	if err := c.s1.AddAddress(1, ProtocolNumber, lladdr1); err != nil {
+	if err := c.s1.AddAddress(nicID, ProtocolNumber, lladdr1); err != nil {
 		t.Fatalf("AddAddress lladdr1: %v", err)
 	}
 
@@ -288,7 +461,7 @@ func newTestContext(t *testing.T) *testContext {
 	c.s0.SetRouteTable(
 		[]tcpip.Route{{
 			Destination: subnet0,
-			NIC:         1,
+			NIC:         nicID,
 		}},
 	)
 	subnet1, err := tcpip.NewSubnet(lladdr0, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr0))))
@@ -298,7 +471,7 @@ func newTestContext(t *testing.T) *testContext {
 	c.s1.SetRouteTable(
 		[]tcpip.Route{{
 			Destination: subnet1,
-			NIC:         1,
+			NIC:         nicID,
 		}},
 	)
 
@@ -359,9 +532,9 @@ func TestLinkResolution(t *testing.T) {
 	c := newTestContext(t)
 	defer c.cleanup()
 
-	r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+	r, err := c.s0.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+		t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
 	}
 	defer r.Release()
 
@@ -376,14 +549,14 @@ func TestLinkResolution(t *testing.T) {
 	var wq waiter.Queue
 	ep, err := c.s0.NewEndpoint(header.ICMPv6ProtocolNumber, ProtocolNumber, &wq)
 	if err != nil {
-		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
+		t.Fatalf("NewEndpoint(_) = (_, %s), want = (_, nil)", err)
 	}
 
 	for {
-		_, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: 1, Addr: lladdr1}})
+		_, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: nicID, Addr: lladdr1}})
 		if resCh != nil {
 			if err != tcpip.ErrNoLinkAddress {
-				t.Fatalf("ep.Write(_) = _, <non-nil>, %s, want = _, <non-nil>, tcpip.ErrNoLinkAddress", err)
+				t.Fatalf("ep.Write(_) = (_, <non-nil>, %s), want = (_, <non-nil>, tcpip.ErrNoLinkAddress)", err)
 			}
 			for _, args := range []routeArgs{
 				{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))},
@@ -399,7 +572,7 @@ func TestLinkResolution(t *testing.T) {
 			continue
 		}
 		if err != nil {
-			t.Fatalf("ep.Write(_) = _, _, %s", err)
+			t.Fatalf("ep.Write(_) = (_, _, %s)", err)
 		}
 		break
 	}
@@ -424,6 +597,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 		size        int
 		extraData   []byte
 		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+		routerOnly  bool
 	}{
 		{
 			name: "DstUnreachable",
@@ -480,6 +654,8 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RouterSolicit
 			},
+			// Hosts MUST silently discard any received Router Solicitation messages.
+			routerOnly: true,
 		},
 		{
 			name: "RouterAdvert",
@@ -516,84 +692,133 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 		},
 	}
 
-	for _, typ := range types {
-		t.Run(typ.name, func(t *testing.T) {
-			e := channel.New(10, 1280, linkAddr0)
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-			})
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
-			}
-
-			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
-			}
-			{
-				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
-				if err != nil {
-					t.Fatal(err)
-				}
-				s.SetRouteTable(
-					[]tcpip.Route{{
-						Destination: subnet,
-						NIC:         1,
-					}},
-				)
-			}
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
 
-			handleIPv6Payload := func(checksum bool) {
-				icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
-				copy(icmp[typ.size:], typ.extraData)
-				icmp.SetType(typ.typ)
-				if checksum {
-					icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView()))
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			for _, typ := range types {
+				for _, isRouter := range []bool{false, true} {
+					name := typ.name
+					if isRouter {
+						name += " (Router)"
+					}
+					t.Run(name, func(t *testing.T) {
+						e := channel.New(0, 1280, linkAddr0)
+
+						// Indicate that resolution for link layer addresses is required to
+						// send packets over this link. This is needed so the NIC knows to
+						// allocate a neighbor table.
+						e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+
+						s := stack.New(stack.Options{
+							NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+							UseNeighborCache: test.useNeighborCache,
+						})
+						if isRouter {
+							// Enabling forwarding makes the stack act as a router.
+							s.SetForwarding(true)
+						}
+						if err := s.CreateNIC(nicID, e); err != nil {
+							t.Fatalf("CreateNIC(_, _) = %s", err)
+						}
+
+						if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+							t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+						}
+						{
+							subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+							if err != nil {
+								t.Fatal(err)
+							}
+							s.SetRouteTable(
+								[]tcpip.Route{{
+									Destination: subnet,
+									NIC:         nicID,
+								}},
+							)
+						}
+
+						handleIPv6Payload := func(checksum bool) {
+							icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+							copy(icmp[typ.size:], typ.extraData)
+							icmp.SetType(typ.typ)
+							if checksum {
+								icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView()))
+							}
+							ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
+							ip.Encode(&header.IPv6Fields{
+								PayloadLength: uint16(len(icmp)),
+								NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+								HopLimit:      header.NDPHopLimit,
+								SrcAddr:       lladdr1,
+								DstAddr:       lladdr0,
+							})
+							pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+								Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
+							})
+							e.InjectInbound(ProtocolNumber, pkt)
+						}
+
+						stats := s.Stats().ICMP.V6PacketsReceived
+						invalid := stats.Invalid
+						routerOnly := stats.RouterOnlyPacketsDroppedByHost
+						typStat := typ.statCounter(stats)
+
+						// Initial stat counts should be 0.
+						if got := invalid.Value(); got != 0 {
+							t.Fatalf("got invalid = %d, want = 0", got)
+						}
+						if got := routerOnly.Value(); got != 0 {
+							t.Fatalf("got RouterOnlyPacketsReceivedByHost = %d, want = 0", got)
+						}
+						if got := typStat.Value(); got != 0 {
+							t.Fatalf("got %s = %d, want = 0", typ.name, got)
+						}
+
+						// Without setting checksum, the incoming packet should
+						// be invalid.
+						handleIPv6Payload(false)
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
+						// Router only count should not have increased.
+						if got := routerOnly.Value(); got != 0 {
+							t.Fatalf("got RouterOnlyPacketsReceivedByHost = %d, want = 0", got)
+						}
+						// Rx count of type typ.typ should not have increased.
+						if got := typStat.Value(); got != 0 {
+							t.Fatalf("got %s = %d, want = 0", typ.name, got)
+						}
+
+						// When checksum is set, it should be received.
+						handleIPv6Payload(true)
+						if got := typStat.Value(); got != 1 {
+							t.Fatalf("got %s = %d, want = 1", typ.name, got)
+						}
+						// Invalid count should not have increased again.
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
+						if !isRouter && typ.routerOnly && test.useNeighborCache {
+							// Router only count should have increased.
+							if got := routerOnly.Value(); got != 1 {
+								t.Fatalf("got RouterOnlyPacketsReceivedByHost = %d, want = 1", got)
+							}
+						}
+					})
 				}
-				ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
-				ip.Encode(&header.IPv6Fields{
-					PayloadLength: uint16(len(icmp)),
-					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-					HopLimit:      header.NDPHopLimit,
-					SrcAddr:       lladdr1,
-					DstAddr:       lladdr0,
-				})
-				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-					Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
-				})
-				e.InjectInbound(ProtocolNumber, pkt)
-			}
-
-			stats := s.Stats().ICMP.V6PacketsReceived
-			invalid := stats.Invalid
-			typStat := typ.statCounter(stats)
-
-			// Initial stat counts should be 0.
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
-			}
-
-			// Without setting checksum, the incoming packet should
-			// be invalid.
-			handleIPv6Payload(false)
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
-			}
-			// Rx count of type typ.typ should not have increased.
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
-			}
-
-			// When checksum is set, it should be received.
-			handleIPv6Payload(true)
-			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
-			}
-			// Invalid count should not have increased again.
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
 			}
 		})
 	}
@@ -696,11 +921,11 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
 			})
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(_, _) = %s", err)
 			}
 
-			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
 				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
 			}
 			{
@@ -711,7 +936,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 				s.SetRouteTable(
 					[]tcpip.Route{{
 						Destination: subnet,
-						NIC:         1,
+						NIC:         nicID,
 					}},
 				)
 			}
@@ -750,7 +975,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// Without setting checksum, the incoming packet should
@@ -761,13 +986,13 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 			}
 			// Rx count of type typ.typ should not have increased.
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// When checksum is set, it should be received.
 			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
 			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 			// Invalid count should not have increased again.
 			if got := invalid.Value(); got != 1 {
@@ -874,12 +1099,12 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
 			})
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
-			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
 			}
 			{
 				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
@@ -889,7 +1114,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 				s.SetRouteTable(
 					[]tcpip.Route{{
 						Destination: subnet,
-						NIC:         1,
+						NIC:         nicID,
 					}},
 				)
 			}
@@ -929,7 +1154,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// Without setting checksum, the incoming packet should
@@ -940,13 +1165,13 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 			}
 			// Rx count of type typ.typ should not have increased.
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// When checksum is set, it should be received.
 			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
 			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 			// Invalid count should not have increased again.
 			if got := invalid.Value(); got != 1 {
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 267d2cce8..36fbbebf0 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -48,6 +48,7 @@ type endpoint struct {
 	nicID         tcpip.NICID
 	linkEP        stack.LinkEndpoint
 	linkAddrCache stack.LinkAddressCache
+	nud           stack.NUDHandler
 	dispatcher    stack.TransportDispatcher
 	protocol      *protocol
 	stack         *stack.Stack
@@ -455,11 +456,12 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }
 
 // NewEndpoint creates a new ipv6 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, nud stack.NUDHandler, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
 	return &endpoint{
 		nicID:         nicID,
 		linkEP:        linkEP,
 		linkAddrCache: linkAddrCache,
+		nud:           nud,
 		dispatcher:    dispatcher,
 		protocol:      p,
 		stack:         st,
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index af71a7d6b..480c495fa 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -18,6 +18,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -30,12 +31,13 @@ import (
 // setupStackAndEndpoint creates a stack with a single NIC with a link-local
 // address llladdr and an IPv6 endpoint to a remote with link-local address
 // rlladdr
-func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack.Stack, stack.NetworkEndpoint) {
+func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address, useNeighborCache bool) (*stack.Stack, stack.NetworkEndpoint) {
 	t.Helper()
 
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
 		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+		UseNeighborCache:   useNeighborCache,
 	})
 
 	if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
@@ -63,8 +65,7 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
 
-	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
-
+	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubNUDHandler{}, &stubDispatcher{}, nil, s)
 	return s, ep
 }
 
@@ -171,6 +172,123 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 	}
 }
 
+// TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache tests
+// that receiving a valid NDP NS message with the Source Link Layer Address
+// option results in a new entry in the link address cache for the sender of
+// the message.
+func TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{1, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
+		{
+			name:    "Too Small",
+			optsBuf: []byte{1, 1, 2, 3, 4, 5, 6},
+		},
+		{
+			name:    "Invalid Length",
+			optsBuf: []byte{1, 2, 2, 3, 4, 5, 6, 7},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+				UseNeighborCache: true,
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+			pkt.SetType(header.ICMPv6NeighborSolicit)
+			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr0)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			neighbors, err := s.Neighbors(nicID)
+			if err != nil {
+				t.Fatalf("s.Neighbors(%d): %s", nicID, err)
+			}
+
+			neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry)
+			for _, n := range neighbors {
+				if existing, ok := neighborByAddr[n.Addr]; ok {
+					if diff := cmp.Diff(existing, n); diff != "" {
+						t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry (-existing +got):\n%s", nicID, diff)
+					}
+					t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry: %s", nicID, existing)
+				}
+				neighborByAddr[n.Addr] = n
+			}
+
+			if neigh, ok := neighborByAddr[lladdr1]; len(test.expectedLinkAddr) != 0 {
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+
+				if !ok {
+					t.Fatalf("expected a neighbor entry for %q", lladdr1)
+				}
+				if neigh.LinkAddr != test.expectedLinkAddr {
+					t.Errorf("got link address = %s, want = %s", neigh.LinkAddr, test.expectedLinkAddr)
+				}
+				if neigh.State != stack.Stale {
+					t.Errorf("got NUD state = %s, want = %s", neigh.State, stack.Stale)
+				}
+			} else {
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+
+				if ok {
+					t.Fatalf("unexpectedly got neighbor entry: %s", neigh)
+				}
+			}
+		})
+	}
+}
+
 func TestNeighorSolicitationResponse(t *testing.T) {
 	const nicID = 1
 	nicAddr := lladdr0
@@ -180,6 +298,20 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 	remoteLinkAddr0 := linkAddr1
 	remoteLinkAddr1 := linkAddr2
 
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
+
 	tests := []struct {
 		name          string
 		nsOpts        header.NDPOptionsSerializer
@@ -338,86 +470,92 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 		},
 	}
 
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-			})
-			e := channel.New(1, 1280, nicLinkAddr)
-			if err := s.CreateNIC(nicID, e); err != nil {
-				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
-			}
-			if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil {
-				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err)
-			}
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			for _, test := range tests {
+				t.Run(test.name, func(t *testing.T) {
+					s := stack.New(stack.Options{
+						NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+						UseNeighborCache: stackTyp.useNeighborCache,
+					})
+					e := channel.New(1, 1280, nicLinkAddr)
+					e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+					if err := s.CreateNIC(nicID, e); err != nil {
+						t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+					}
+					if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil {
+						t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err)
+					}
 
-			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length()
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
-			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
-			pkt.SetType(header.ICMPv6NeighborSolicit)
-			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
-			ns.SetTargetAddress(nicAddr)
-			opts := ns.Options()
-			opts.Serialize(test.nsOpts)
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{}))
-			payloadLength := hdr.UsedLength()
-			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-			ip.Encode(&header.IPv6Fields{
-				PayloadLength: uint16(payloadLength),
-				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-				HopLimit:      255,
-				SrcAddr:       test.nsSrc,
-				DstAddr:       test.nsDst,
-			})
+					ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length()
+					hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+					pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+					pkt.SetType(header.ICMPv6NeighborSolicit)
+					ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+					ns.SetTargetAddress(nicAddr)
+					opts := ns.Options()
+					opts.Serialize(test.nsOpts)
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{}))
+					payloadLength := hdr.UsedLength()
+					ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+					ip.Encode(&header.IPv6Fields{
+						PayloadLength: uint16(payloadLength),
+						NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+						HopLimit:      255,
+						SrcAddr:       test.nsSrc,
+						DstAddr:       test.nsDst,
+					})
+
+					invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
 
-			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+					// Invalid count should initially be 0.
+					if got := invalid.Value(); got != 0 {
+						t.Fatalf("got invalid = %d, want = 0", got)
+					}
 
-			// Invalid count should initially be 0.
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
+					e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{
+						Data: hdr.View().ToVectorisedView(),
+					}))
 
-			e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{
-				Data: hdr.View().ToVectorisedView(),
-			}))
+					if test.nsInvalid {
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
 
-			if test.nsInvalid {
-				if got := invalid.Value(); got != 1 {
-					t.Fatalf("got invalid = %d, want = 1", got)
-				}
+						if p, got := e.Read(); got {
+							t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt)
+						}
 
-				if p, got := e.Read(); got {
-					t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt)
-				}
+						// If we expected the NS to be invalid, we have nothing else to check.
+						return
+					}
 
-				// If we expected the NS to be invalid, we have nothing else to check.
-				return
-			}
+					if got := invalid.Value(); got != 0 {
+						t.Fatalf("got invalid = %d, want = 0", got)
+					}
 
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
+					p, got := e.Read()
+					if !got {
+						t.Fatal("expected an NDP NA response")
+					}
 
-			p, got := e.Read()
-			if !got {
-				t.Fatal("expected an NDP NA response")
-			}
+					if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
+						t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
+					}
 
-			if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
-				t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
+					checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+						checker.SrcAddr(test.naSrc),
+						checker.DstAddr(test.naDst),
+						checker.TTL(header.NDPHopLimit),
+						checker.NDPNA(
+							checker.NDPNASolicitedFlag(test.naSolicited),
+							checker.NDPNATargetAddress(nicAddr),
+							checker.NDPNAOptions([]header.NDPOption{
+								header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]),
+							}),
+						))
+				})
 			}
-
-			checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
-				checker.SrcAddr(test.naSrc),
-				checker.DstAddr(test.naDst),
-				checker.TTL(header.NDPHopLimit),
-				checker.NDPNA(
-					checker.NDPNASolicitedFlag(test.naSolicited),
-					checker.NDPNATargetAddress(nicAddr),
-					checker.NDPNAOptions([]header.NDPOption{
-						header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]),
-					}),
-				))
 		})
 	}
 }
@@ -532,197 +670,380 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 	}
 }
 
-func TestNDPValidation(t *testing.T) {
-	setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) {
-		t.Helper()
-
-		// Create a stack with the assigned link-local address lladdr0
-		// and an endpoint to lladdr1.
-		s, ep := setupStackAndEndpoint(t, lladdr0, lladdr1)
-
-		r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
-		if err != nil {
-			t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
-		}
-
-		return s, ep, r
-	}
-
-	handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
-		nextHdr := uint8(header.ICMPv6ProtocolNumber)
-		var extensions buffer.View
-		if atomicFragment {
-			extensions = buffer.NewView(header.IPv6FragmentExtHdrLength)
-			extensions[0] = nextHdr
-			nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
-		}
-
-		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions),
-			Data:               payload.ToVectorisedView(),
-		})
-		ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions)))
-		ip.Encode(&header.IPv6Fields{
-			PayloadLength: uint16(len(payload) + len(extensions)),
-			NextHeader:    nextHdr,
-			HopLimit:      hopLimit,
-			SrcAddr:       r.LocalAddress,
-			DstAddr:       r.RemoteAddress,
-		})
-		if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) {
-			t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n)
-		}
-		ep.HandlePacket(r, pkt)
-	}
-
-	var tllData [header.NDPLinkLayerAddressSize]byte
-	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
-		header.NDPTargetLinkLayerAddressOption(linkAddr1),
-	})
+// TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache tests
+// that receiving a valid NDP NA message with the Target Link Layer Address
+// option does not result in a new entry in the neighbor cache for the target
+// of the message.
+func TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache(t *testing.T) {
+	const nicID = 1
 
-	types := []struct {
-		name        string
-		typ         header.ICMPv6Type
-		size        int
-		extraData   []byte
-		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+	tests := []struct {
+		name    string
+		optsBuf []byte
+		isValid bool
 	}{
 		{
-			name: "RouterSolicit",
-			typ:  header.ICMPv6RouterSolicit,
-			size: header.ICMPv6MinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.RouterSolicit
-			},
-		},
-		{
-			name: "RouterAdvert",
-			typ:  header.ICMPv6RouterAdvert,
-			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.RouterAdvert
-			},
+			name:    "Valid",
+			optsBuf: []byte{2, 1, 2, 3, 4, 5, 6, 7},
+			isValid: true,
 		},
 		{
-			name: "NeighborSolicit",
-			typ:  header.ICMPv6NeighborSolicit,
-			size: header.ICMPv6NeighborSolicitMinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.NeighborSolicit
-			},
+			name:    "Too Small",
+			optsBuf: []byte{2, 1, 2, 3, 4, 5, 6},
 		},
 		{
-			name:      "NeighborAdvert",
-			typ:       header.ICMPv6NeighborAdvert,
-			size:      header.ICMPv6NeighborAdvertMinimumSize,
-			extraData: tllData[:],
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.NeighborAdvert
-			},
+			name:    "Invalid Length",
+			optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7},
 		},
 		{
-			name: "RedirectMsg",
-			typ:  header.ICMPv6RedirectMsg,
-			size: header.ICMPv6MinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.RedirectMsg
+			name: "Multiple",
+			optsBuf: []byte{
+				2, 1, 2, 3, 4, 5, 6, 7,
+				2, 1, 2, 3, 4, 5, 6, 8,
 			},
 		},
 	}
 
-	subTests := []struct {
-		name           string
-		atomicFragment bool
-		hopLimit       uint8
-		code           header.ICMPv6Code
-		valid          bool
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+				UseNeighborCache: true,
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+			pkt.SetType(header.ICMPv6NeighborAdvert)
+			ns := header.NDPNeighborAdvert(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr1)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			neighbors, err := s.Neighbors(nicID)
+			if err != nil {
+				t.Fatalf("s.Neighbors(%d): %s", nicID, err)
+			}
+
+			neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry)
+			for _, n := range neighbors {
+				if existing, ok := neighborByAddr[n.Addr]; ok {
+					if diff := cmp.Diff(existing, n); diff != "" {
+						t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry (-existing +got):\n%s", nicID, diff)
+					}
+					t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry: %s", nicID, existing)
+				}
+				neighborByAddr[n.Addr] = n
+			}
+
+			if neigh, ok := neighborByAddr[lladdr1]; ok {
+				t.Fatalf("unexpectedly got neighbor entry: %s", neigh)
+			}
+
+			if test.isValid {
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+			}
+		})
+	}
+}
+
+func TestNDPValidation(t *testing.T) {
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
 	}{
 		{
-			name:           "Valid",
-			atomicFragment: false,
-			hopLimit:       header.NDPHopLimit,
-			code:           0,
-			valid:          true,
-		},
-		{
-			name:           "Fragmented",
-			atomicFragment: true,
-			hopLimit:       header.NDPHopLimit,
-			code:           0,
-			valid:          false,
-		},
-		{
-			name:           "Invalid hop limit",
-			atomicFragment: false,
-			hopLimit:       header.NDPHopLimit - 1,
-			code:           0,
-			valid:          false,
+			name:             "linkAddrCache",
+			useNeighborCache: false,
 		},
 		{
-			name:           "Invalid ICMPv6 code",
-			atomicFragment: false,
-			hopLimit:       header.NDPHopLimit,
-			code:           1,
-			valid:          false,
+			name:             "neighborCache",
+			useNeighborCache: true,
 		},
 	}
 
-	for _, typ := range types {
-		t.Run(typ.name, func(t *testing.T) {
-			for _, test := range subTests {
-				t.Run(test.name, func(t *testing.T) {
-					s, ep, r := setup(t)
-					defer r.Release()
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) {
+				t.Helper()
 
-					stats := s.Stats().ICMP.V6PacketsReceived
-					invalid := stats.Invalid
-					typStat := typ.statCounter(stats)
+				// Create a stack with the assigned link-local address lladdr0
+				// and an endpoint to lladdr1.
+				s, ep := setupStackAndEndpoint(t, lladdr0, lladdr1, stackTyp.useNeighborCache)
 
-					icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
-					copy(icmp[typ.size:], typ.extraData)
-					icmp.SetType(typ.typ)
-					icmp.SetCode(test.code)
-					icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+				r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+				}
 
-					// Rx count of the NDP message should initially be 0.
-					if got := typStat.Value(); got != 0 {
-						t.Errorf("got %s = %d, want = 0", typ.name, got)
-					}
+				return s, ep, r
+			}
 
-					// Invalid count should initially be 0.
-					if got := invalid.Value(); got != 0 {
-						t.Errorf("got invalid = %d, want = 0", got)
-					}
+			handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
+				nextHdr := uint8(header.ICMPv6ProtocolNumber)
+				var extensions buffer.View
+				if atomicFragment {
+					extensions = buffer.NewView(header.IPv6FragmentExtHdrLength)
+					extensions[0] = nextHdr
+					nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
+				}
 
-					if t.Failed() {
-						t.FailNow()
-					}
+				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+					ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions),
+					Data:               payload.ToVectorisedView(),
+				})
+				ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions)))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(len(payload) + len(extensions)),
+					NextHeader:    nextHdr,
+					HopLimit:      hopLimit,
+					SrcAddr:       r.LocalAddress,
+					DstAddr:       r.RemoteAddress,
+				})
+				if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) {
+					t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n)
+				}
+				ep.HandlePacket(r, pkt)
+			}
 
-					handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r)
+			var tllData [header.NDPLinkLayerAddressSize]byte
+			header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+				header.NDPTargetLinkLayerAddressOption(linkAddr1),
+			})
 
-					// Rx count of the NDP packet should have increased.
-					if got := typStat.Value(); got != 1 {
-						t.Errorf("got %s = %d, want = 1", typ.name, got)
-					}
+			var sllData [header.NDPLinkLayerAddressSize]byte
+			header.NDPOptions(sllData[:]).Serialize(header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(linkAddr1),
+			})
 
-					want := uint64(0)
-					if !test.valid {
-						// Invalid count should have increased.
-						want = 1
-					}
-					if got := invalid.Value(); got != want {
-						t.Errorf("got invalid = %d, want = %d", got, want)
+			types := []struct {
+				name        string
+				typ         header.ICMPv6Type
+				size        int
+				extraData   []byte
+				statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+				routerOnly  bool
+			}{
+				{
+					name: "RouterSolicit",
+					typ:  header.ICMPv6RouterSolicit,
+					size: header.ICMPv6MinimumSize,
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.RouterSolicit
+					},
+					routerOnly: true,
+				},
+				{
+					name: "RouterAdvert",
+					typ:  header.ICMPv6RouterAdvert,
+					size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.RouterAdvert
+					},
+				},
+				{
+					name:      "NeighborSolicit",
+					typ:       header.ICMPv6NeighborSolicit,
+					size:      header.ICMPv6NeighborSolicitMinimumSize,
+					extraData: sllData[:],
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.NeighborSolicit
+					},
+				},
+				{
+					name:      "NeighborAdvert",
+					typ:       header.ICMPv6NeighborAdvert,
+					size:      header.ICMPv6NeighborAdvertMinimumSize,
+					extraData: tllData[:],
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.NeighborAdvert
+					},
+				},
+				{
+					name: "RedirectMsg",
+					typ:  header.ICMPv6RedirectMsg,
+					size: header.ICMPv6MinimumSize,
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.RedirectMsg
+					},
+				},
+			}
+
+			subTests := []struct {
+				name           string
+				atomicFragment bool
+				hopLimit       uint8
+				code           header.ICMPv6Code
+				valid          bool
+			}{
+				{
+					name:           "Valid",
+					atomicFragment: false,
+					hopLimit:       header.NDPHopLimit,
+					code:           0,
+					valid:          true,
+				},
+				{
+					name:           "Fragmented",
+					atomicFragment: true,
+					hopLimit:       header.NDPHopLimit,
+					code:           0,
+					valid:          false,
+				},
+				{
+					name:           "Invalid hop limit",
+					atomicFragment: false,
+					hopLimit:       header.NDPHopLimit - 1,
+					code:           0,
+					valid:          false,
+				},
+				{
+					name:           "Invalid ICMPv6 code",
+					atomicFragment: false,
+					hopLimit:       header.NDPHopLimit,
+					code:           1,
+					valid:          false,
+				},
+			}
+
+			for _, typ := range types {
+				for _, isRouter := range []bool{false, true} {
+					name := typ.name
+					if isRouter {
+						name += " (Router)"
 					}
-				})
+
+					t.Run(name, func(t *testing.T) {
+						for _, test := range subTests {
+							t.Run(test.name, func(t *testing.T) {
+								s, ep, r := setup(t)
+								defer r.Release()
+
+								if isRouter {
+									// Enabling forwarding makes the stack act as a router.
+									s.SetForwarding(true)
+								}
+
+								stats := s.Stats().ICMP.V6PacketsReceived
+								invalid := stats.Invalid
+								routerOnly := stats.RouterOnlyPacketsDroppedByHost
+								typStat := typ.statCounter(stats)
+
+								icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+								copy(icmp[typ.size:], typ.extraData)
+								icmp.SetType(typ.typ)
+								icmp.SetCode(test.code)
+								icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+
+								// Rx count of the NDP message should initially be 0.
+								if got := typStat.Value(); got != 0 {
+									t.Errorf("got %s = %d, want = 0", typ.name, got)
+								}
+
+								// Invalid count should initially be 0.
+								if got := invalid.Value(); got != 0 {
+									t.Errorf("got invalid = %d, want = 0", got)
+								}
+
+								// RouterOnlyPacketsReceivedByHost count should initially be 0.
+								if got := routerOnly.Value(); got != 0 {
+									t.Errorf("got RouterOnlyPacketsReceivedByHost = %d, want = 0", got)
+								}
+
+								if t.Failed() {
+									t.FailNow()
+								}
+
+								handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r)
+
+								// Rx count of the NDP packet should have increased.
+								if got := typStat.Value(); got != 1 {
+									t.Errorf("got %s = %d, want = 1", typ.name, got)
+								}
+
+								want := uint64(0)
+								if !test.valid {
+									// Invalid count should have increased.
+									want = 1
+								}
+								if got := invalid.Value(); got != want {
+									t.Errorf("got invalid = %d, want = %d", got, want)
+								}
+
+								want = 0
+								if test.valid && !isRouter && typ.routerOnly {
+									// RouterOnlyPacketsReceivedByHost count should have increased.
+									want = 1
+								}
+								if got := routerOnly.Value(); got != want {
+									t.Errorf("got RouterOnlyPacketsReceivedByHost = %d, want = %d", got, want)
+								}
+
+							})
+						}
+					})
+				}
 			}
 		})
 	}
+
 }
 
 // TestRouterAdvertValidation tests that when the NIC is configured to handle
 // NDP Router Advertisement packets, it validates the Router Advertisement
 // properly before handling them.
 func TestRouterAdvertValidation(t *testing.T) {
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
+
 	tests := []struct {
 		name            string
 		src             tcpip.Address
@@ -844,61 +1165,67 @@ func TestRouterAdvertValidation(t *testing.T) {
 		},
 	}
 
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			e := channel.New(10, 1280, linkAddr1)
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-			})
-
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
-			}
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			for _, test := range tests {
+				t.Run(test.name, func(t *testing.T) {
+					e := channel.New(10, 1280, linkAddr1)
+					e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+					s := stack.New(stack.Options{
+						NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+						UseNeighborCache: stackTyp.useNeighborCache,
+					})
+
+					if err := s.CreateNIC(1, e); err != nil {
+						t.Fatalf("CreateNIC(_) = %s", err)
+					}
 
-			icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload)
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
-			pkt := header.ICMPv6(hdr.Prepend(icmpSize))
-			pkt.SetType(header.ICMPv6RouterAdvert)
-			pkt.SetCode(test.code)
-			copy(pkt.NDPPayload(), test.ndpPayload)
-			payloadLength := hdr.UsedLength()
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
-			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-			ip.Encode(&header.IPv6Fields{
-				PayloadLength: uint16(payloadLength),
-				NextHeader:    uint8(icmp.ProtocolNumber6),
-				HopLimit:      test.hopLimit,
-				SrcAddr:       test.src,
-				DstAddr:       header.IPv6AllNodesMulticastAddress,
-			})
+					icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload)
+					hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+					pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+					pkt.SetType(header.ICMPv6RouterAdvert)
+					pkt.SetCode(test.code)
+					copy(pkt.NDPPayload(), test.ndpPayload)
+					payloadLength := hdr.UsedLength()
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+					ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+					ip.Encode(&header.IPv6Fields{
+						PayloadLength: uint16(payloadLength),
+						NextHeader:    uint8(icmp.ProtocolNumber6),
+						HopLimit:      test.hopLimit,
+						SrcAddr:       test.src,
+						DstAddr:       header.IPv6AllNodesMulticastAddress,
+					})
 
-			stats := s.Stats().ICMP.V6PacketsReceived
-			invalid := stats.Invalid
-			rxRA := stats.RouterAdvert
+					stats := s.Stats().ICMP.V6PacketsReceived
+					invalid := stats.Invalid
+					rxRA := stats.RouterAdvert
 
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
-			if got := rxRA.Value(); got != 0 {
-				t.Fatalf("got rxRA = %d, want = 0", got)
-			}
+					if got := invalid.Value(); got != 0 {
+						t.Fatalf("got invalid = %d, want = 0", got)
+					}
+					if got := rxRA.Value(); got != 0 {
+						t.Fatalf("got rxRA = %d, want = 0", got)
+					}
 
-			e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
-				Data: hdr.View().ToVectorisedView(),
-			}))
+					e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+						Data: hdr.View().ToVectorisedView(),
+					}))
 
-			if got := rxRA.Value(); got != 1 {
-				t.Fatalf("got rxRA = %d, want = 1", got)
-			}
+					if got := rxRA.Value(); got != 1 {
+						t.Fatalf("got rxRA = %d, want = 1", got)
+					}
 
-			if test.expectedSuccess {
-				if got := invalid.Value(); got != 0 {
-					t.Fatalf("got invalid = %d, want = 0", got)
-				}
-			} else {
-				if got := invalid.Value(); got != 1 {
-					t.Fatalf("got invalid = %d, want = 1", got)
-				}
+					if test.expectedSuccess {
+						if got := invalid.Value(); got != 0 {
+							t.Fatalf("got invalid = %d, want = 0", got)
+						}
+					} else {
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
+					}
+				})
 			}
 		})
 	}
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 5a684eb9d..91165ebc7 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -51,6 +51,8 @@ type fwdTestNetworkEndpoint struct {
 	ep         LinkEndpoint
 }
 
+var _ NetworkEndpoint = (*fwdTestNetworkEndpoint)(nil)
+
 func (f *fwdTestNetworkEndpoint) MTU() uint32 {
 	return f.ep.MTU() - uint32(f.MaxHeaderLength())
 }
@@ -110,11 +112,13 @@ func (*fwdTestNetworkEndpoint) Close() {}
 // resolution.
 type fwdTestNetworkProtocol struct {
 	addrCache              *linkAddrCache
+	neigh                  *neighborCache
 	addrResolveDelay       time.Duration
-	onLinkAddressResolved  func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress)
+	onLinkAddressResolved  func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress)
 	onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool)
 }
 
+var _ NetworkProtocol = (*fwdTestNetworkProtocol)(nil)
 var _ LinkAddressResolver = (*fwdTestNetworkProtocol)(nil)
 
 func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
@@ -141,7 +145,7 @@ func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocol
 	return tcpip.TransportProtocolNumber(netHeader[protocolNumberOffset]), true, true
 }
 
-func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) NetworkEndpoint {
+func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCache, _ NUDHandler, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) NetworkEndpoint {
 	return &fwdTestNetworkEndpoint{
 		nicID:      nicID,
 		proto:      f,
@@ -163,9 +167,9 @@ func (f *fwdTestNetworkProtocol) Close() {}
 func (f *fwdTestNetworkProtocol) Wait() {}
 
 func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error {
-	if f.addrCache != nil && f.onLinkAddressResolved != nil {
+	if f.onLinkAddressResolved != nil {
 		time.AfterFunc(f.addrResolveDelay, func() {
-			f.onLinkAddressResolved(f.addrCache, addr, remoteLinkAddr)
+			f.onLinkAddressResolved(f.addrCache, f.neigh, addr, remoteLinkAddr)
 		})
 	}
 	return nil
@@ -300,13 +304,16 @@ func (e *fwdTestLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protoco
 	panic("not implemented")
 }
 
-func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) {
+func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol, useNeighborCache bool) (ep1, ep2 *fwdTestLinkEndpoint) {
 	// Create a stack with the network protocol and two NICs.
 	s := New(Options{
 		NetworkProtocols: []NetworkProtocol{proto},
+		UseNeighborCache: useNeighborCache,
 	})
 
-	proto.addrCache = s.linkAddrCache
+	if !useNeighborCache {
+		proto.addrCache = s.linkAddrCache
+	}
 
 	// Enable forwarding.
 	s.SetForwarding(true)
@@ -337,6 +344,15 @@ func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *f
 		t.Fatal("AddAddress #2 failed:", err)
 	}
 
+	if useNeighborCache {
+		// Control the neighbor cache for NIC 2.
+		nic, ok := s.nics[2]
+		if !ok {
+			t.Fatal("failed to get the neighbor cache for NIC 2")
+		}
+		proto.neigh = nic.neigh
+	}
+
 	// Route all packets to NIC 2.
 	{
 		subnet, err := tcpip.NewSubnet("\x00", "\x00")
@@ -350,79 +366,129 @@ func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *f
 }
 
 func TestForwardingWithStaticResolver(t *testing.T) {
-	// Create a network protocol with a static resolver.
-	proto := &fwdTestNetworkProtocol{
-		onResolveStaticAddress:
-		// The network address 3 is resolved to the link address "c".
-		func(addr tcpip.Address) (tcpip.LinkAddress, bool) {
-			if addr == "\x03" {
-				return "c", true
-			}
-			return "", false
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
 		},
 	}
 
-	ep1, ep2 := fwdTestNetFactory(t, proto)
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Create a network protocol with a static resolver.
+			proto := &fwdTestNetworkProtocol{
+				onResolveStaticAddress:
+				// The network address 3 is resolved to the link address "c".
+				func(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+					if addr == "\x03" {
+						return "c", true
+					}
+					return "", false
+				},
+			}
 
-	// Inject an inbound packet to address 3 on NIC 1, and see if it is
-	// forwarded to NIC 2.
-	buf := buffer.NewView(30)
-	buf[dstAddrOffset] = 3
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
+			ep1, ep2 := fwdTestNetFactory(t, proto, test.useNeighborCache)
 
-	var p fwdTestPacketInfo
+			// Inject an inbound packet to address 3 on NIC 1, and see if it is
+			// forwarded to NIC 2.
+			buf := buffer.NewView(30)
+			buf[dstAddrOffset] = 3
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
 
-	select {
-	case p = <-ep2.C:
-	default:
-		t.Fatal("packet not forwarded")
-	}
+			var p fwdTestPacketInfo
 
-	// Test that the static address resolution happened correctly.
-	if p.RemoteLinkAddress != "c" {
-		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-	}
-	if p.LocalLinkAddress != "b" {
-		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			select {
+			case p = <-ep2.C:
+			default:
+				t.Fatal("packet not forwarded")
+			}
+
+			// Test that the static address resolution happened correctly.
+			if p.RemoteLinkAddress != "c" {
+				t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+			}
+			if p.LocalLinkAddress != "b" {
+				t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			}
+		})
 	}
 }
 
 func TestForwardingWithFakeResolver(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any address will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any address will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any address will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
 		},
 	}
 
-	ep1, ep2 := fwdTestNetFactory(t, proto)
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
 
-	// Inject an inbound packet to address 3 on NIC 1, and see if it is
-	// forwarded to NIC 2.
-	buf := buffer.NewView(30)
-	buf[dstAddrOffset] = 3
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
+			// Inject an inbound packet to address 3 on NIC 1, and see if it is
+			// forwarded to NIC 2.
+			buf := buffer.NewView(30)
+			buf[dstAddrOffset] = 3
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
 
-	var p fwdTestPacketInfo
+			var p fwdTestPacketInfo
 
-	select {
-	case p = <-ep2.C:
-	case <-time.After(time.Second):
-		t.Fatal("packet not forwarded")
-	}
+			select {
+			case p = <-ep2.C:
+			case <-time.After(time.Second):
+				t.Fatal("packet not forwarded")
+			}
 
-	// Test that the address resolution happened correctly.
-	if p.RemoteLinkAddress != "c" {
-		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-	}
-	if p.LocalLinkAddress != "b" {
-		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			// Test that the address resolution happened correctly.
+			if p.RemoteLinkAddress != "c" {
+				t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+			}
+			if p.LocalLinkAddress != "b" {
+				t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			}
+		})
 	}
 }
 
@@ -430,7 +496,9 @@ func TestForwardingWithNoResolver(t *testing.T) {
 	// Create a network protocol without a resolver.
 	proto := &fwdTestNetworkProtocol{}
 
-	ep1, ep2 := fwdTestNetFactory(t, proto)
+	// Whether or not we use the neighbor cache here does not matter since
+	// neither linkAddrCache nor neighborCache will be used.
+	ep1, ep2 := fwdTestNetFactory(t, proto, false /* useNeighborCache */)
 
 	// inject an inbound packet to address 3 on NIC 1, and see if it is
 	// forwarded to NIC 2.
@@ -448,203 +516,334 @@ func TestForwardingWithNoResolver(t *testing.T) {
 }
 
 func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Only packets to address 3 will be resolved to the
-			// link address "c".
-			if addr == "\x03" {
-				cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
-			}
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Only packets to address 3 will be resolved to the
+					// link address "c".
+					if addr == "\x03" {
+						cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+					}
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Only packets to address 3 will be resolved to the
+					// link address "c".
+					if addr == "\x03" {
+						neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+							Solicited: true,
+							Override:  false,
+							IsRouter:  false,
+						})
+					}
+				},
+			},
 		},
 	}
 
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	// Inject an inbound packet to address 4 on NIC 1. This packet should
-	// not be forwarded.
-	buf := buffer.NewView(30)
-	buf[dstAddrOffset] = 4
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	// Inject an inbound packet to address 3 on NIC 1, and see if it is
-	// forwarded to NIC 2.
-	buf = buffer.NewView(30)
-	buf[dstAddrOffset] = 3
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	var p fwdTestPacketInfo
-
-	select {
-	case p = <-ep2.C:
-	case <-time.After(time.Second):
-		t.Fatal("packet not forwarded")
-	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
+
+			// Inject an inbound packet to address 4 on NIC 1. This packet should
+			// not be forwarded.
+			buf := buffer.NewView(30)
+			buf[dstAddrOffset] = 4
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
+
+			// Inject an inbound packet to address 3 on NIC 1, and see if it is
+			// forwarded to NIC 2.
+			buf = buffer.NewView(30)
+			buf[dstAddrOffset] = 3
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
+
+			var p fwdTestPacketInfo
+
+			select {
+			case p = <-ep2.C:
+			case <-time.After(time.Second):
+				t.Fatal("packet not forwarded")
+			}
 
-	if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
-		t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
-	}
+			if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
+				t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
+			}
 
-	// Test that the address resolution happened correctly.
-	if p.RemoteLinkAddress != "c" {
-		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-	}
-	if p.LocalLinkAddress != "b" {
-		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			// Test that the address resolution happened correctly.
+			if p.RemoteLinkAddress != "c" {
+				t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+			}
+			if p.LocalLinkAddress != "b" {
+				t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			}
+		})
 	}
 }
 
 func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any packets will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any packets will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
 		},
 	}
 
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	// Inject two inbound packets to address 3 on NIC 1.
-	for i := 0; i < 2; i++ {
-		buf := buffer.NewView(30)
-		buf[dstAddrOffset] = 3
-		ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-			Data: buf.ToVectorisedView(),
-		}))
-	}
-
-	for i := 0; i < 2; i++ {
-		var p fwdTestPacketInfo
-
-		select {
-		case p = <-ep2.C:
-		case <-time.After(time.Second):
-			t.Fatal("packet not forwarded")
-		}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
 
-		if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
-			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
-		}
+			// Inject two inbound packets to address 3 on NIC 1.
+			for i := 0; i < 2; i++ {
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = 3
+				ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+					Data: buf.ToVectorisedView(),
+				}))
+			}
 
-		// Test that the address resolution happened correctly.
-		if p.RemoteLinkAddress != "c" {
-			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-		}
-		if p.LocalLinkAddress != "b" {
-			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-		}
+			for i := 0; i < 2; i++ {
+				var p fwdTestPacketInfo
+
+				select {
+				case p = <-ep2.C:
+				case <-time.After(time.Second):
+					t.Fatal("packet not forwarded")
+				}
+
+				if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
+					t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
+				}
+
+				// Test that the address resolution happened correctly.
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
 	}
 }
 
 func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any packets will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any packets will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
 		},
 	}
 
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
-		// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
-		buf := buffer.NewView(30)
-		buf[dstAddrOffset] = 3
-		// Set the packet sequence number.
-		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
-		ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-			Data: buf.ToVectorisedView(),
-		}))
-	}
-
-	for i := 0; i < maxPendingPacketsPerResolution; i++ {
-		var p fwdTestPacketInfo
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
 
-		select {
-		case p = <-ep2.C:
-		case <-time.After(time.Second):
-			t.Fatal("packet not forwarded")
-		}
-
-		b := PayloadSince(p.Pkt.NetworkHeader())
-		if b[dstAddrOffset] != 3 {
-			t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
-		}
-		if len(b) < fwdTestNetHeaderLen+2 {
-			t.Fatalf("packet is too short to hold a sequence number: len(b) = %d", b)
-		}
-		seqNumBuf := b[fwdTestNetHeaderLen:]
-
-		// The first 5 packets should not be forwarded so the sequence number should
-		// start with 5.
-		want := uint16(i + 5)
-		if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
-			t.Fatalf("got the packet #%d, want = #%d", n, want)
-		}
+			for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
+				// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = 3
+				// Set the packet sequence number.
+				binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
+				ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+					Data: buf.ToVectorisedView(),
+				}))
+			}
 
-		// Test that the address resolution happened correctly.
-		if p.RemoteLinkAddress != "c" {
-			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-		}
-		if p.LocalLinkAddress != "b" {
-			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-		}
+			for i := 0; i < maxPendingPacketsPerResolution; i++ {
+				var p fwdTestPacketInfo
+
+				select {
+				case p = <-ep2.C:
+				case <-time.After(time.Second):
+					t.Fatal("packet not forwarded")
+				}
+
+				b := PayloadSince(p.Pkt.NetworkHeader())
+				if b[dstAddrOffset] != 3 {
+					t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
+				}
+				if len(b) < fwdTestNetHeaderLen+2 {
+					t.Fatalf("packet is too short to hold a sequence number: len(b) = %d", b)
+				}
+				seqNumBuf := b[fwdTestNetHeaderLen:]
+
+				// The first 5 packets should not be forwarded so the sequence number should
+				// start with 5.
+				want := uint16(i + 5)
+				if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
+					t.Fatalf("got the packet #%d, want = #%d", n, want)
+				}
+
+				// Test that the address resolution happened correctly.
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
 	}
 }
 
 func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any packets will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any packets will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
 		},
 	}
 
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	for i := 0; i < maxPendingResolutions+5; i++ {
-		// Inject inbound 'maxPendingResolutions + 5' packets on NIC 1.
-		// Each packet has a different destination address (3 to
-		// maxPendingResolutions + 7).
-		buf := buffer.NewView(30)
-		buf[dstAddrOffset] = byte(3 + i)
-		ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-			Data: buf.ToVectorisedView(),
-		}))
-	}
-
-	for i := 0; i < maxPendingResolutions; i++ {
-		var p fwdTestPacketInfo
-
-		select {
-		case p = <-ep2.C:
-		case <-time.After(time.Second):
-			t.Fatal("packet not forwarded")
-		}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
 
-		// The first 5 packets (address 3 to 7) should not be forwarded
-		// because their address resolutions are interrupted.
-		if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] < 8 {
-			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", nh[dstAddrOffset])
-		}
+			for i := 0; i < maxPendingResolutions+5; i++ {
+				// Inject inbound 'maxPendingResolutions + 5' packets on NIC 1.
+				// Each packet has a different destination address (3 to
+				// maxPendingResolutions + 7).
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = byte(3 + i)
+				ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+					Data: buf.ToVectorisedView(),
+				}))
+			}
 
-		// Test that the address resolution happened correctly.
-		if p.RemoteLinkAddress != "c" {
-			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-		}
-		if p.LocalLinkAddress != "b" {
-			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-		}
+			for i := 0; i < maxPendingResolutions; i++ {
+				var p fwdTestPacketInfo
+
+				select {
+				case p = <-ep2.C:
+				case <-time.After(time.Second):
+					t.Fatal("packet not forwarded")
+				}
+
+				// The first 5 packets (address 3 to 7) should not be forwarded
+				// because their address resolutions are interrupted.
+				if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] < 8 {
+					t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", nh[dstAddrOffset])
+				}
+
+				// Test that the address resolution happened correctly.
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
 	}
 }
diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go
index b15b8d1cb..14fb4239b 100644
--- a/pkg/tcpip/stack/linkaddrcache_test.go
+++ b/pkg/tcpip/stack/linkaddrcache_test.go
@@ -275,3 +275,71 @@ func TestStaticResolution(t *testing.T) {
 		t.Errorf("c.get(%q)=%q, want %q", string(addr), string(got), string(want))
 	}
 }
+
+// TestCacheWaker verifies that RemoveWaker removes a waker previously added
+// through get().
+func TestCacheWaker(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, 1*time.Second, 3)
+
+	// First, sanity check that wakers are working.
+	{
+		linkRes := &testLinkAddressResolver{cache: c}
+		s := sleep.Sleeper{}
+		defer s.Done()
+
+		const wakerID = 1
+		w := sleep.Waker{}
+		s.AddWaker(&w, wakerID)
+
+		e := testAddrs[0]
+
+		if _, _, err := c.get(e.addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock {
+			t.Fatalf("got c.get(%q, _, _, _, _) = %s, want = %s", e.addr.Addr, err, tcpip.ErrWouldBlock)
+		}
+		id, ok := s.Fetch(true /* block */)
+		if !ok {
+			t.Fatal("got s.Fetch(true) = (_, false), want = (_, true)")
+		}
+		if id != wakerID {
+			t.Fatalf("got s.Fetch(true) = (%d, %t), want = (%d, true)", id, ok, wakerID)
+		}
+
+		if got, _, err := c.get(e.addr, linkRes, "", nil, nil); err != nil {
+			t.Fatalf("c.get(%q, _, _, _, _): %s", e.addr.Addr, err)
+		} else if got != e.linkAddr {
+			t.Fatalf("got c.get(%q) = %q, want = %q", e.addr.Addr, got, e.linkAddr)
+		}
+	}
+
+	// Check that RemoveWaker works.
+	{
+		linkRes := &testLinkAddressResolver{cache: c}
+		s := sleep.Sleeper{}
+		defer s.Done()
+
+		const wakerID = 2 // different than the ID used in the sanity check
+		w := sleep.Waker{}
+		s.AddWaker(&w, wakerID)
+
+		e := testAddrs[1]
+		linkRes.onLinkAddressRequest = func() {
+			// Remove the waker before the linkAddrCache has the opportunity to send
+			// a notification.
+			c.removeWaker(e.addr, &w)
+		}
+
+		if _, _, err := c.get(e.addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock {
+			t.Fatalf("got c.get(%q, _, _, _, _) = %s, want = %s", e.addr.Addr, err, tcpip.ErrWouldBlock)
+		}
+
+		if got, err := getBlocking(c, e.addr, linkRes); err != nil {
+			t.Fatalf("c.get(%q, _, _, _, _): %s", e.addr.Addr, err)
+		} else if got != e.linkAddr {
+			t.Fatalf("c.get(%q) = %q, want = %q", e.addr.Addr, got, e.linkAddr)
+		}
+
+		if id, ok := s.Fetch(false /* block */); ok {
+			t.Fatalf("unexpected notification from waker with id %d", id)
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 21bf53010..67dc5364f 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -2787,7 +2787,7 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) {
 // stack.Stack will have a default route through the router (llAddr3) installed
 // and a static link-address (linkAddr3) added to the link address cache for the
 // router.
-func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) {
+func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID, useNeighborCache bool) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) {
 	t.Helper()
 	ndpDisp := &ndpDispatcher{
 		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
@@ -2800,7 +2800,8 @@ func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*nd
 			HandleRAs:              true,
 			AutoGenGlobalAddresses: true,
 		},
-		NDPDisp: ndpDisp,
+		NDPDisp:          ndpDisp,
+		UseNeighborCache: useNeighborCache,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -2810,7 +2811,11 @@ func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*nd
 		Gateway:     llAddr3,
 		NIC:         nicID,
 	}})
-	s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+	if useNeighborCache {
+		s.AddStaticNeighbor(nicID, llAddr3, linkAddr3)
+	} else {
+		s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+	}
 	return ndpDisp, e, s
 }
 
@@ -2884,110 +2889,128 @@ func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullA
 // TestAutoGenAddrDeprecateFromPI tests deprecating a SLAAC address when
 // receiving a PI with 0 preferred lifetime.
 func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
-	const nicID = 1
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
 
-	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
-	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			const nicID = 1
 
-	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+			prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+			prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
 
-	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
-		t.Helper()
+			ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache)
 
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
 			}
-		default:
-			t.Fatal("expected addr auto gen event")
-		}
-	}
 
-	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
-		t.Helper()
+			expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+				t.Helper()
 
-		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-		} else if got != addr {
-			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
-		}
+				if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+					t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+				} else if got != addr {
+					t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+				}
 
-		if got := addrForNewConnection(t, s); got != addr.Address {
-			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
-		}
-	}
+				if got := addrForNewConnection(t, s); got != addr.Address {
+					t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+				}
+			}
 
-	// Receive PI for prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
-	expectAutoGenAddrEvent(addr1, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should have %s in the list of addresses", addr1)
-	}
-	expectPrimaryAddr(addr1)
+			// Receive PI for prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+			expectAutoGenAddrEvent(addr1, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should have %s in the list of addresses", addr1)
+			}
+			expectPrimaryAddr(addr1)
 
-	// Deprecate addr for prefix1 immedaitely.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
-	expectAutoGenAddrEvent(addr1, deprecatedAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should have %s in the list of addresses", addr1)
-	}
-	// addr should still be the primary endpoint as there are no other addresses.
-	expectPrimaryAddr(addr1)
+			// Deprecate addr for prefix1 immedaitely.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+			expectAutoGenAddrEvent(addr1, deprecatedAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should have %s in the list of addresses", addr1)
+			}
+			// addr should still be the primary endpoint as there are no other addresses.
+			expectPrimaryAddr(addr1)
 
-	// Refresh lifetimes of addr generated from prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr1)
+			// Refresh lifetimes of addr generated from prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr1)
 
-	// Receive PI for prefix2.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr2)
+			// Receive PI for prefix2.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr2)
 
-	// Deprecate addr for prefix2 immedaitely.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
-	expectAutoGenAddrEvent(addr2, deprecatedAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	// addr1 should be the primary endpoint now since addr2 is deprecated but
-	// addr1 is not.
-	expectPrimaryAddr(addr1)
-	// addr2 is deprecated but if explicitly requested, it should be used.
-	fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
-	}
+			// Deprecate addr for prefix2 immedaitely.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+			expectAutoGenAddrEvent(addr2, deprecatedAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			// addr1 should be the primary endpoint now since addr2 is deprecated but
+			// addr1 is not.
+			expectPrimaryAddr(addr1)
+			// addr2 is deprecated but if explicitly requested, it should be used.
+			fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+			}
 
-	// Another PI w/ 0 preferred lifetime should not result in a deprecation
-	// event.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr1)
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
-	}
+			// Another PI w/ 0 preferred lifetime should not result in a deprecation
+			// event.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr1)
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+			}
 
-	// Refresh lifetimes of addr generated from prefix2.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
+			// Refresh lifetimes of addr generated from prefix2.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr2)
+		})
 	}
-	expectPrimaryAddr(addr2)
 }
 
 // TestAutoGenAddrJobDeprecation tests that an address is properly deprecated
@@ -2996,217 +3019,236 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) {
 	const nicID = 1
 	const newMinVL = 2
 	newMinVLDuration := newMinVL * time.Second
-	saved := stack.MinPrefixInformationValidLifetimeForUpdate
-	defer func() {
-		stack.MinPrefixInformationValidLifetimeForUpdate = saved
-	}()
-	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
 
-	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
-	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
 
-	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			saved := stack.MinPrefixInformationValidLifetimeForUpdate
+			defer func() {
+				stack.MinPrefixInformationValidLifetimeForUpdate = saved
+			}()
+			stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
 
-	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
-		t.Helper()
+			prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+			prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
 
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache)
+
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
 			}
-		default:
-			t.Fatal("expected addr auto gen event")
-		}
-	}
 
-	expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
-		t.Helper()
+			expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+				t.Helper()
 
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(timeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
 			}
-		case <-time.After(timeout):
-			t.Fatal("timed out waiting for addr auto gen event")
-		}
-	}
 
-	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
-		t.Helper()
+			expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+				t.Helper()
 
-		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-		} else if got != addr {
-			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
-		}
+				if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+					t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+				} else if got != addr {
+					t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+				}
 
-		if got := addrForNewConnection(t, s); got != addr.Address {
-			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
-		}
-	}
+				if got := addrForNewConnection(t, s); got != addr.Address {
+					t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+				}
+			}
 
-	// Receive PI for prefix2.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr2)
+			// Receive PI for prefix2.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr2)
 
-	// Receive a PI for prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
-	expectAutoGenAddrEvent(addr1, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr1)
+			// Receive a PI for prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
+			expectAutoGenAddrEvent(addr1, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr1)
 
-	// Refresh lifetime for addr of prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr1)
+			// Refresh lifetime for addr of prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr1)
 
-	// Wait for addr of prefix1 to be deprecated.
-	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	// addr2 should be the primary endpoint now since addr1 is deprecated but
-	// addr2 is not.
-	expectPrimaryAddr(addr2)
-	// addr1 is deprecated but if explicitly requested, it should be used.
-	fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
-	}
+			// Wait for addr of prefix1 to be deprecated.
+			expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			// addr2 should be the primary endpoint now since addr1 is deprecated but
+			// addr2 is not.
+			expectPrimaryAddr(addr2)
+			// addr1 is deprecated but if explicitly requested, it should be used.
+			fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+			}
 
-	// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
-	// sure we do not get a deprecation event again.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr2)
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
-	}
+			// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
+			// sure we do not get a deprecation event again.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr2)
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+			}
 
-	// Refresh lifetimes for addr of prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	// addr1 is the primary endpoint again since it is non-deprecated now.
-	expectPrimaryAddr(addr1)
+			// Refresh lifetimes for addr of prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			// addr1 is the primary endpoint again since it is non-deprecated now.
+			expectPrimaryAddr(addr1)
 
-	// Wait for addr of prefix1 to be deprecated.
-	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	// addr2 should be the primary endpoint now since it is not deprecated.
-	expectPrimaryAddr(addr2)
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
-	}
+			// Wait for addr of prefix1 to be deprecated.
+			expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			// addr2 should be the primary endpoint now since it is not deprecated.
+			expectPrimaryAddr(addr2)
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+			}
 
-	// Wait for addr of prefix1 to be invalidated.
-	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout)
-	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr2)
+			// Wait for addr of prefix1 to be invalidated.
+			expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout)
+			if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr2)
 
-	// Refresh both lifetimes for addr of prefix2 to the same value.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
+			// Refresh both lifetimes for addr of prefix2 to the same value.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
 
-	// Wait for a deprecation then invalidation events, or just an invalidation
-	// event. We need to cover both cases but cannot deterministically hit both
-	// cases because the deprecation and invalidation handlers could be handled in
-	// either deprecation then invalidation, or invalidation then deprecation
-	// (which should be cancelled by the invalidation handler).
-	select {
-	case e := <-ndpDisp.autoGenAddrC:
-		if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" {
-			// If we get a deprecation event first, we should get an invalidation
-			// event almost immediately after.
+			// Wait for a deprecation then invalidation events, or just an invalidation
+			// event. We need to cover both cases but cannot deterministically hit both
+			// cases because the deprecation and invalidation handlers could be handled in
+			// either deprecation then invalidation, or invalidation then deprecation
+			// (which should be cancelled by the invalidation handler).
 			select {
 			case e := <-ndpDisp.autoGenAddrC:
-				if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
-					t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" {
+					// If we get a deprecation event first, we should get an invalidation
+					// event almost immediately after.
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(defaultAsyncPositiveEventTimeout):
+						t.Fatal("timed out waiting for addr auto gen event")
+					}
+				} else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
+					// If we get an invalidation  event first, we should not get a deprecation
+					// event after.
+					select {
+					case <-ndpDisp.autoGenAddrC:
+						t.Fatal("unexpectedly got an auto-generated event")
+					case <-time.After(defaultAsyncNegativeEventTimeout):
+					}
+				} else {
+					t.Fatalf("got unexpected auto-generated event")
 				}
-			case <-time.After(defaultAsyncPositiveEventTimeout):
+			case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout):
 				t.Fatal("timed out waiting for addr auto gen event")
 			}
-		} else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
-			// If we get an invalidation  event first, we should not get a deprecation
-			// event after.
-			select {
-			case <-ndpDisp.autoGenAddrC:
-				t.Fatal("unexpectedly got an auto-generated event")
-			case <-time.After(defaultAsyncNegativeEventTimeout):
+			if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should not have %s in the list of addresses", addr2)
+			}
+			// Should not have any primary endpoints.
+			if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+				t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+			} else if want := (tcpip.AddressWithPrefix{}); got != want {
+				t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want)
+			}
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+			}
+			defer ep.Close()
+			if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+				t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
 			}
-		} else {
-			t.Fatalf("got unexpected auto-generated event")
-		}
-	case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout):
-		t.Fatal("timed out waiting for addr auto gen event")
-	}
-	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should not have %s in the list of addresses", addr2)
-	}
-	// Should not have any primary endpoints.
-	if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-		t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-	} else if want := (tcpip.AddressWithPrefix{}); got != want {
-		t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want)
-	}
-	wq := waiter.Queue{}
-	we, ch := waiter.NewChannelEntry(nil)
-	wq.EventRegister(&we, waiter.EventIn)
-	defer wq.EventUnregister(&we)
-	defer close(ch)
-	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
-	if err != nil {
-		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
-	}
-	defer ep.Close()
-	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
-	}
 
-	if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
-		t.Errorf("got ep.Connect(%+v) = %v, want = %s", dstAddr, err, tcpip.ErrNoRoute)
+			if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
+				t.Errorf("got ep.Connect(%+v) = %s, want = %s", dstAddr, err, tcpip.ErrNoRoute)
+			}
+		})
 	}
 }
 
@@ -3524,110 +3566,128 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 func TestAutoGenAddrAfterRemoval(t *testing.T) {
 	const nicID = 1
 
-	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
-	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
-	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
-
-	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
-		t.Helper()
-
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
-			}
-		default:
-			t.Fatal("expected addr auto gen event")
-		}
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
 	}
 
-	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
-		t.Helper()
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+			prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+			ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache)
 
-		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-		} else if got != addr {
-			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
-		}
-
-		if got := addrForNewConnection(t, s); got != addr.Address {
-			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
-		}
-	}
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
 
-	// Receive a PI to auto-generate addr1 with a large valid and preferred
-	// lifetime.
-	const largeLifetimeSeconds = 999
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
-	expectAutoGenAddrEvent(addr1, newAddr)
-	expectPrimaryAddr(addr1)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			}
 
-	// Add addr2 as a static address.
-	protoAddr2 := tcpip.ProtocolAddress{
-		Protocol:          header.IPv6ProtocolNumber,
-		AddressWithPrefix: addr2,
-	}
-	if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
-		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
-	}
-	// addr2 should be more preferred now since it is at the front of the primary
-	// list.
-	expectPrimaryAddr(addr2)
+			expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+				t.Helper()
 
-	// Get a route using addr2 to increment its reference count then remove it
-	// to leave it in the permanentExpired state.
-	r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false)
-	if err != nil {
-		t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err)
-	}
-	defer r.Release()
-	if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
-		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
-	}
-	// addr1 should be preferred again since addr2 is in the expired state.
-	expectPrimaryAddr(addr1)
+				if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+					t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+				} else if got != addr {
+					t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+				}
 
-	// Receive a PI to auto-generate addr2 as valid and preferred.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	// addr2 should be more preferred now that it is closer to the front of the
-	// primary list and not deprecated.
-	expectPrimaryAddr(addr2)
+				if got := addrForNewConnection(t, s); got != addr.Address {
+					t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+				}
+			}
 
-	// Removing the address should result in an invalidation event immediately.
-	// It should still be in the permanentExpired state because r is still held.
-	//
-	// We remove addr2 here to make sure addr2 was marked as a SLAAC address
-	// (it was previously marked as a static address).
-	if err := s.RemoveAddress(1, addr2.Address); err != nil {
-		t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
-	}
-	expectAutoGenAddrEvent(addr2, invalidatedAddr)
-	// addr1 should be more preferred since addr2 is in the expired state.
-	expectPrimaryAddr(addr1)
+			// Receive a PI to auto-generate addr1 with a large valid and preferred
+			// lifetime.
+			const largeLifetimeSeconds = 999
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+			expectAutoGenAddrEvent(addr1, newAddr)
+			expectPrimaryAddr(addr1)
 
-	// Receive a PI to auto-generate addr2 as valid and deprecated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	// addr1 should still be more preferred since addr2 is deprecated, even though
-	// it is closer to the front of the primary list.
-	expectPrimaryAddr(addr1)
+			// Add addr2 as a static address.
+			protoAddr2 := tcpip.ProtocolAddress{
+				Protocol:          header.IPv6ProtocolNumber,
+				AddressWithPrefix: addr2,
+			}
+			if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
+				t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
+			}
+			// addr2 should be more preferred now since it is at the front of the primary
+			// list.
+			expectPrimaryAddr(addr2)
 
-	// Receive a PI to refresh addr2's preferred lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto gen addr event")
-	default:
-	}
-	// addr2 should be more preferred now that it is not deprecated.
-	expectPrimaryAddr(addr2)
+			// Get a route using addr2 to increment its reference count then remove it
+			// to leave it in the permanentExpired state.
+			r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err)
+			}
+			defer r.Release()
+			if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
+				t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
+			}
+			// addr1 should be preferred again since addr2 is in the expired state.
+			expectPrimaryAddr(addr1)
+
+			// Receive a PI to auto-generate addr2 as valid and preferred.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			// addr2 should be more preferred now that it is closer to the front of the
+			// primary list and not deprecated.
+			expectPrimaryAddr(addr2)
+
+			// Removing the address should result in an invalidation event immediately.
+			// It should still be in the permanentExpired state because r is still held.
+			//
+			// We remove addr2 here to make sure addr2 was marked as a SLAAC address
+			// (it was previously marked as a static address).
+			if err := s.RemoveAddress(1, addr2.Address); err != nil {
+				t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+			}
+			expectAutoGenAddrEvent(addr2, invalidatedAddr)
+			// addr1 should be more preferred since addr2 is in the expired state.
+			expectPrimaryAddr(addr1)
+
+			// Receive a PI to auto-generate addr2 as valid and deprecated.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			// addr1 should still be more preferred since addr2 is deprecated, even though
+			// it is closer to the front of the primary list.
+			expectPrimaryAddr(addr1)
+
+			// Receive a PI to refresh addr2's preferred lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto gen addr event")
+			default:
+			}
+			// addr2 should be more preferred now that it is not deprecated.
+			expectPrimaryAddr(addr2)
 
-	if err := s.RemoveAddress(1, addr2.Address); err != nil {
-		t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+			if err := s.RemoveAddress(1, addr2.Address); err != nil {
+				t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+			}
+			expectAutoGenAddrEvent(addr2, invalidatedAddr)
+			expectPrimaryAddr(addr1)
+		})
 	}
-	expectAutoGenAddrEvent(addr2, invalidatedAddr)
-	expectPrimaryAddr(addr1)
 }
 
 // TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index aff29f9cc..0c811efdb 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -21,6 +21,7 @@ import (
 	"sort"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -135,18 +136,8 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	}
 	nic.mu.ndp.initializeTempAddrState()
 
-	// Register supported packet endpoint protocols.
-	for _, netProto := range header.Ethertypes {
-		nic.mu.packetEPs[netProto] = []PacketEndpoint{}
-	}
-	for _, netProto := range stack.networkProtocols {
-		netNum := netProto.Number()
-		nic.mu.packetEPs[netNum] = nil
-		nic.networkEndpoints[netNum] = netProto.NewEndpoint(id, stack, nic, ep, stack)
-	}
-
 	// Check for Neighbor Unreachability Detection support.
-	if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 {
+	if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 && stack.useNeighborCache {
 		rng := rand.New(rand.NewSource(stack.clock.NowNanoseconds()))
 		nic.neigh = &neighborCache{
 			nic:   nic,
@@ -155,6 +146,16 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 		}
 	}
 
+	// Register supported packet endpoint protocols.
+	for _, netProto := range header.Ethertypes {
+		nic.mu.packetEPs[netProto] = []PacketEndpoint{}
+	}
+	for _, netProto := range stack.networkProtocols {
+		netNum := netProto.Number()
+		nic.mu.packetEPs[netNum] = nil
+		nic.networkEndpoints[netNum] = netProto.NewEndpoint(id, stack, nic.neigh, nic, ep, stack)
+	}
+
 	nic.linkEP.Attach(nic)
 
 	return nic
@@ -431,7 +432,7 @@ func (n *NIC) setSpoofing(enable bool) {
 // If an IPv6 primary endpoint is requested, Source Address Selection (as
 // defined by RFC 6724 section 5) will be performed.
 func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) *referencedNetworkEndpoint {
-	if protocol == header.IPv6ProtocolNumber && remoteAddr != "" {
+	if protocol == header.IPv6ProtocolNumber && len(remoteAddr) != 0 {
 		return n.primaryIPv6Endpoint(remoteAddr)
 	}
 
@@ -818,11 +819,24 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 		}
 	}
 
-	ep, ok := n.networkEndpoints[protocolAddress.Protocol]
+	netProto, ok := n.stack.networkProtocols[protocolAddress.Protocol]
 	if !ok {
 		return nil, tcpip.ErrUnknownProtocol
 	}
 
+	var nud NUDHandler
+	if n.neigh != nil {
+		// An interface value that holds a nil concrete value is itself non-nil.
+		// For this reason, n.neigh cannot be passed directly to NewEndpoint so
+		// NetworkEndpoints don't confuse it for non-nil.
+		//
+		// See https://golang.org/doc/faq#nil_error for more information.
+		nud = n.neigh
+	}
+
+	// Create the new network endpoint.
+	ep := netProto.NewEndpoint(n.id, n.stack, nud, n, n.linkEP, n.stack)
+
 	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
 
 	// If the address is an IPv6 address and it is a permanent address,
@@ -844,10 +858,11 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 		deprecated: deprecated,
 	}
 
-	// Set up cache if link address resolution exists for this protocol.
+	// Set up resolver if link address resolution exists for this protocol.
 	if n.linkEP.Capabilities()&CapabilityResolutionRequired != 0 {
-		if _, ok := n.stack.linkAddrResolvers[protocolAddress.Protocol]; ok {
+		if linkRes, ok := n.stack.linkAddrResolvers[protocolAddress.Protocol]; ok {
 			ref.linkCache = n.stack
+			ref.linkRes = linkRes
 		}
 	}
 
@@ -1082,6 +1097,51 @@ func (n *NIC) RemoveAddress(addr tcpip.Address) *tcpip.Error {
 	return n.removePermanentAddressLocked(addr)
 }
 
+func (n *NIC) neighbors() ([]NeighborEntry, *tcpip.Error) {
+	if n.neigh == nil {
+		return nil, tcpip.ErrNotSupported
+	}
+
+	return n.neigh.entries(), nil
+}
+
+func (n *NIC) removeWaker(addr tcpip.Address, w *sleep.Waker) {
+	if n.neigh == nil {
+		return
+	}
+
+	n.neigh.removeWaker(addr, w)
+}
+
+func (n *NIC) addStaticNeighbor(addr tcpip.Address, linkAddress tcpip.LinkAddress) *tcpip.Error {
+	if n.neigh == nil {
+		return tcpip.ErrNotSupported
+	}
+
+	n.neigh.addStaticEntry(addr, linkAddress)
+	return nil
+}
+
+func (n *NIC) removeNeighbor(addr tcpip.Address) *tcpip.Error {
+	if n.neigh == nil {
+		return tcpip.ErrNotSupported
+	}
+
+	if !n.neigh.removeEntry(addr) {
+		return tcpip.ErrBadAddress
+	}
+	return nil
+}
+
+func (n *NIC) clearNeighbors() *tcpip.Error {
+	if n.neigh == nil {
+		return tcpip.ErrNotSupported
+	}
+
+	n.neigh.clear()
+	return nil
+}
+
 // joinGroup adds a new endpoint for the given multicast address, if none
 // exists yet. Otherwise it just increments its count.
 func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
@@ -1662,6 +1722,10 @@ type referencedNetworkEndpoint struct {
 	// protocol. Set to nil otherwise.
 	linkCache LinkAddressCache
 
+	// linkRes is set if link address resolution is enabled for this protocol.
+	// Set to nil otherwise.
+	linkRes LinkAddressResolver
+
 	// refs is counting references held for this endpoint. When refs hits zero it
 	// triggers the automatic removal of the endpoint from the NIC.
 	refs int32
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index d312a79eb..1e065b5c1 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -192,7 +192,7 @@ func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 }
 
 // NewEndpoint implements NetworkProtocol.NewEndpoint.
-func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) NetworkEndpoint {
+func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCache, _ NUDHandler, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) NetworkEndpoint {
 	return &testIPv6Endpoint{
 		nicID:    nicID,
 		linkEP:   linkEP,
diff --git a/pkg/tcpip/stack/nud_test.go b/pkg/tcpip/stack/nud_test.go
index 2494ee610..2b97e5972 100644
--- a/pkg/tcpip/stack/nud_test.go
+++ b/pkg/tcpip/stack/nud_test.go
@@ -61,6 +61,7 @@ func TestSetNUDConfigurationFailsForBadNICID(t *testing.T) {
 		// stack will only allocate neighbor caches if a protocol providing link
 		// address resolution is specified (e.g. ARP or IPv6).
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		UseNeighborCache: true,
 	})
 
 	// No NIC with ID 1 yet.
@@ -84,7 +85,8 @@ func TestNUDConfigurationFailsForNotSupported(t *testing.T) {
 	e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 
 	s := stack.New(stack.Options{
-		NUDConfigs: stack.DefaultNUDConfigurations(),
+		NUDConfigs:       stack.DefaultNUDConfigurations(),
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -108,7 +110,8 @@ func TestSetNUDConfigurationFailsForNotSupported(t *testing.T) {
 	e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 
 	s := stack.New(stack.Options{
-		NUDConfigs: stack.DefaultNUDConfigurations(),
+		NUDConfigs:       stack.DefaultNUDConfigurations(),
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -136,6 +139,7 @@ func TestDefaultNUDConfigurations(t *testing.T) {
 		// address resolution is specified (e.g. ARP or IPv6).
 		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 		NUDConfigs:       stack.DefaultNUDConfigurations(),
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -190,6 +194,7 @@ func TestNUDConfigurationsBaseReachableTime(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -246,6 +251,7 @@ func TestNUDConfigurationsMinRandomFactor(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -325,6 +331,7 @@ func TestNUDConfigurationsMaxRandomFactor(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -386,6 +393,7 @@ func TestNUDConfigurationsRetransmitTimer(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -437,6 +445,7 @@ func TestNUDConfigurationsDelayFirstProbeTime(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -488,6 +497,7 @@ func TestNUDConfigurationsMaxMulticastProbes(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -539,6 +549,7 @@ func TestNUDConfigurationsMaxUnicastProbes(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -590,6 +601,7 @@ func TestNUDConfigurationsUnreachableTime(t *testing.T) {
 				// providing link address resolution is specified (e.g. ARP or IPv6).
 				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index aca2f77f8..21ac38583 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -298,7 +298,7 @@ type NetworkProtocol interface {
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
 	// NewEndpoint creates a new endpoint of this protocol.
-	NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) NetworkEndpoint
+	NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, nud NUDHandler, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) NetworkEndpoint
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -488,7 +488,7 @@ type LinkAddressResolver interface {
 	ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool)
 
 	// LinkAddressProtocol returns the network protocol of the
-	// addresses this this resolver can resolve.
+	// addresses this resolver can resolve.
 	LinkAddressProtocol() tcpip.NetworkProtocolNumber
 }
 
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index e267bebb0..c2eabde9e 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -141,6 +141,16 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
 		}
 		nextAddr = r.RemoteAddress
 	}
+
+	if r.ref.nic.neigh != nil {
+		entry, ch, err := r.ref.nic.neigh.entry(nextAddr, r.LocalAddress, r.ref.linkRes, waker)
+		if err != nil {
+			return ch, err
+		}
+		r.RemoteLinkAddress = entry.LinkAddr
+		return nil, nil
+	}
+
 	linkAddr, ch, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker)
 	if err != nil {
 		return ch, err
@@ -155,6 +165,12 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) {
 	if nextAddr == "" {
 		nextAddr = r.RemoteAddress
 	}
+
+	if r.ref.nic.neigh != nil {
+		r.ref.nic.neigh.removeWaker(nextAddr, waker)
+		return
+	}
+
 	r.ref.linkCache.RemoveWaker(r.ref.nic.ID(), nextAddr, waker)
 }
 
@@ -163,6 +179,9 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) {
 //
 // The NIC r uses must not be locked.
 func (r *Route) IsResolutionRequired() bool {
+	if r.ref.nic.neigh != nil {
+		return r.ref.isValidForOutgoing() && r.ref.linkRes != nil && r.RemoteLinkAddress == ""
+	}
 	return r.ref.isValidForOutgoing() && r.ref.linkCache != nil && r.RemoteLinkAddress == ""
 }
 
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a3f87c8af..7f5ed9e83 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -248,7 +248,7 @@ type RcvBufAutoTuneParams struct {
 	// was started.
 	MeasureTime time.Time
 
-	// CopiedBytes is the number of bytes copied to userspace since
+	// CopiedBytes is the number of bytes copied to user space since
 	// this measure began.
 	CopiedBytes int
 
@@ -461,6 +461,10 @@ type Stack struct {
 	// nudConfigs is the default NUD configurations used by interfaces.
 	nudConfigs NUDConfigurations
 
+	// useNeighborCache indicates whether ARP and NDP packets should be handled
+	// by the NIC's neighborCache instead of linkAddrCache.
+	useNeighborCache bool
+
 	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
 	// to auto-generate an IPv6 link-local address for newly enabled non-loopback
 	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
@@ -541,6 +545,13 @@ type Options struct {
 	// NUDConfigs is the default NUD configurations used by interfaces.
 	NUDConfigs NUDConfigurations
 
+	// UseNeighborCache indicates whether ARP and NDP packets should be handled
+	// by the Neighbor Unreachability Detection (NUD) state machine. This flag
+	// also enables the APIs for inspecting and modifying the neighbor table via
+	// NUDDispatcher and the following Stack methods: Neighbors, RemoveNeighbor,
+	// and ClearNeighbors.
+	UseNeighborCache bool
+
 	// AutoGenIPv6LinkLocal determines whether or not the stack will attempt to
 	// auto-generate an IPv6 link-local address for newly enabled non-loopback
 	// NICs.
@@ -715,6 +726,7 @@ func New(opts Options) *Stack {
 		seed:                 generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
 		nudConfigs:           opts.NUDConfigs,
+		useNeighborCache:     opts.UseNeighborCache,
 		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
 		uniqueIDGenerator:    opts.UniqueID,
 		ndpDisp:              opts.NDPDisp,
@@ -1209,8 +1221,8 @@ func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tc
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[id]
-	if nic == nil {
+	nic, ok := s.nics[id]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
@@ -1335,8 +1347,8 @@ func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProto
 
 	// If a NIC is specified, we try to find the address there only.
 	if nicID != 0 {
-		nic := s.nics[nicID]
-		if nic == nil {
+		nic, ok := s.nics[nicID]
+		if !ok {
 			return 0
 		}
 
@@ -1367,8 +1379,8 @@ func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[nicID]
-	if nic == nil {
+	nic, ok := s.nics[nicID]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
@@ -1383,8 +1395,8 @@ func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[nicID]
-	if nic == nil {
+	nic, ok := s.nics[nicID]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
@@ -1416,8 +1428,33 @@ func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address,
 	return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, waker)
 }
 
-// RemoveWaker implements LinkAddressCache.RemoveWaker.
+// Neighbors returns all IP to MAC address associations.
+func (s *Stack) Neighbors(nicID tcpip.NICID) ([]NeighborEntry, *tcpip.Error) {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return nil, tcpip.ErrUnknownNICID
+	}
+
+	return nic.neighbors()
+}
+
+// RemoveWaker removes a waker that has been added when link resolution for
+// addr was requested.
 func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
+	if s.useNeighborCache {
+		s.mu.RLock()
+		nic, ok := s.nics[nicID]
+		s.mu.RUnlock()
+
+		if ok {
+			nic.removeWaker(addr, waker)
+		}
+		return
+	}
+
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
@@ -1427,6 +1464,47 @@ func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.
 	}
 }
 
+// AddStaticNeighbor statically associates an IP address to a MAC address.
+func (s *Stack) AddStaticNeighbor(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.addStaticNeighbor(addr, linkAddr)
+}
+
+// RemoveNeighbor removes an IP to MAC address association previously created
+// either automically or by AddStaticNeighbor. Returns ErrBadAddress if there
+// is no association with the provided address.
+func (s *Stack) RemoveNeighbor(nicID tcpip.NICID, addr tcpip.Address) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.removeNeighbor(addr)
+}
+
+// ClearNeighbors removes all IP to MAC address associations.
+func (s *Stack) ClearNeighbors(nicID tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.clearNeighbors()
+}
+
 // RegisterTransportEndpoint registers the given endpoint with the stack
 // transport dispatcher. Received packets that match the provided id will be
 // delivered to the given endpoint; specifying a nic is optional, but
@@ -1961,7 +2039,7 @@ func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, addres
 	return nil, tcpip.ErrBadAddress
 }
 
-// FindNICNameFromID returns the name of the nic for the given NICID.
+// FindNICNameFromID returns the name of the NIC for the given NICID.
 func (s *Stack) FindNICNameFromID(id tcpip.NICID) string {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 106645c50..1deeccb89 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -197,7 +197,7 @@ func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Addres
 	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
 }
 
-func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) stack.NetworkEndpoint {
+func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, _ stack.LinkAddressCache, _ stack.NUDHandler, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) stack.NetworkEndpoint {
 	return &fakeNetworkEndpoint{
 		nicID:      nicID,
 		proto:      f,
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 290c4e138..44f87e007 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1192,6 +1192,10 @@ type ICMPv6ReceivedPacketStats struct {
 	// Invalid is the total number of ICMPv6 packets received that the
 	// transport layer could not parse.
 	Invalid *StatCounter
+
+	// RouterOnlyPacketsDroppedByHost is the total number of ICMPv6 packets
+	// dropped due to being router-specific packets.
+	RouterOnlyPacketsDroppedByHost *StatCounter
 }
 
 // ICMPStats collects ICMP-specific stats (both v4 and v6).
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f87d99d5a..0a558df6d 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -1469,7 +1469,7 @@ func TestTTL(t *testing.T) {
 				} else {
 					p = ipv6.NewProtocol()
 				}
-				ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{
+				ep := p.NewEndpoint(0, nil, nil, nil, nil, stack.New(stack.Options{
 					NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
 					TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
 				}))
@@ -1502,7 +1502,7 @@ func TestSetTTL(t *testing.T) {
 					} else {
 						p = ipv6.NewProtocol()
 					}
-					ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{
+					ep := p.NewEndpoint(0, nil, nil, nil, nil, stack.New(stack.Options{
 						NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
 						TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
 					}))
-- 
cgit v1.2.3


From 0e91c5804318732e57543ad9a3012b5cb0715b7a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 25 Aug 2020 11:57:35 -0700
Subject: Change "Fd" member to "FD" according to convension

PiperOrigin-RevId: 328374775
---
 pkg/sentry/vfs/file_description.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 33910e095..3219a9e13 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -851,7 +851,7 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn
 // FileReadWriteSeeker is a helper struct to pass a FileDescription as
 // io.Reader/io.Writer/io.ReadSeeker/etc.
 type FileReadWriteSeeker struct {
-	Fd    *FileDescription
+	FD    *FileDescription
 	Ctx   context.Context
 	ROpts ReadOptions
 	WOpts WriteOptions
@@ -860,18 +860,18 @@ type FileReadWriteSeeker struct {
 // Read implements io.ReadWriteSeeker.Read.
 func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
 	dst := usermem.BytesIOSequence(p)
-	ret, err := f.Fd.Read(f.Ctx, dst, f.ROpts)
+	ret, err := f.FD.Read(f.Ctx, dst, f.ROpts)
 	return int(ret), err
 }
 
 // Seek implements io.ReadWriteSeeker.Seek.
 func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
-	return f.Fd.Seek(f.Ctx, offset, int32(whence))
+	return f.FD.Seek(f.Ctx, offset, int32(whence))
 }
 
 // Write implements io.ReadWriteSeeker.Write.
 func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
 	buf := usermem.BytesIOSequence(p)
-	ret, err := f.Fd.Write(f.Ctx, buf, f.WOpts)
+	ret, err := f.FD.Write(f.Ctx, buf, f.WOpts)
 	return int(ret), err
 }
-- 
cgit v1.2.3


From 61ad71e6be239a860ed946722f0c4e4e8e643d16 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 25 Aug 2020 12:16:31 -0700
Subject: Add nogo support to go_binary and go_test targets.

Updates #3374

PiperOrigin-RevId: 328378700
---
 pkg/cpuid/cpuid_parse_x86_test.go      | 12 +++----
 pkg/seccomp/BUILD                      |  1 +
 test/benchmarks/database/redis_test.go |  4 +--
 test/benchmarks/fs/bazel_test.go       |  2 +-
 test/benchmarks/network/node_test.go   |  4 +--
 test/benchmarks/network/ruby_test.go   |  4 +--
 test/packetimpact/runner/defs.bzl      |  1 +
 test/root/crictl_test.go               |  2 +-
 test/runtimes/proctor/BUILD            |  1 +
 tools/bazeldefs/defs.bzl               |  9 ++++-
 tools/defs.bzl                         | 40 +++++++++++++++++++--
 tools/issue_reviver/BUILD              |  1 +
 tools/issue_reviver/github/BUILD       |  1 +
 tools/nogo/check/BUILD                 |  1 +
 tools/nogo/defs.bzl                    | 63 ++++++++++++++++++++++++++++------
 15 files changed, 118 insertions(+), 28 deletions(-)

(limited to 'pkg')

diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go
index c9bd40e1b..e4ae0d689 100644
--- a/pkg/cpuid/cpuid_parse_x86_test.go
+++ b/pkg/cpuid/cpuid_parse_x86_test.go
@@ -32,27 +32,27 @@ func kernelVersion() (int, int, error) {
 		return 0, 0, err
 	}
 
-	var r string
+	var sb strings.Builder
 	for _, b := range u.Release {
 		if b == 0 {
 			break
 		}
-		r += string(b)
+		sb.WriteByte(byte(b))
 	}
 
-	s := strings.Split(r, ".")
+	s := strings.Split(sb.String(), ".")
 	if len(s) < 2 {
-		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", r)
+		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", sb.String())
 	}
 
 	major, err := strconv.Atoi(s[0])
 	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %v", s[0], r, err)
+		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %w", s[0], sb.String(), err)
 	}
 
 	minor, err := strconv.Atoi(s[1])
 	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %v", s[1], r, err)
+		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %w", s[1], sb.String(), err)
 	}
 
 	return major, minor, nil
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index 29aeaab8c..bdef7762c 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -10,6 +10,7 @@ go_binary(
         "seccomp_test_victim_amd64.go",
         "seccomp_test_victim_arm64.go",
     ],
+    nogo = False,
     deps = [":seccomp"],
 )
 
diff --git a/test/benchmarks/database/redis_test.go b/test/benchmarks/database/redis_test.go
index 394fce820..6671a4969 100644
--- a/test/benchmarks/database/redis_test.go
+++ b/test/benchmarks/database/redis_test.go
@@ -84,12 +84,12 @@ func BenchmarkRedis(b *testing.B) {
 
 			ip, err := serverMachine.IPAddress()
 			if err != nil {
-				b.Fatal("failed to get IP from server: %v", err)
+				b.Fatalf("failed to get IP from server: %v", err)
 			}
 
 			serverPort, err := server.FindPort(ctx, port)
 			if err != nil {
-				b.Fatal("failed to get IP from server: %v", err)
+				b.Fatalf("failed to get IP from server: %v", err)
 			}
 
 			if err = harness.WaitUntilServing(ctx, clientMachine, ip, serverPort); err != nil {
diff --git a/test/benchmarks/fs/bazel_test.go b/test/benchmarks/fs/bazel_test.go
index f4236ba37..fdbbfe280 100644
--- a/test/benchmarks/fs/bazel_test.go
+++ b/test/benchmarks/fs/bazel_test.go
@@ -73,7 +73,7 @@ func runBuildBenchmark(b *testing.B, image, workdir, target string) {
 			if bm.tmpfs {
 				if out, err := container.Exec(ctx, dockerutil.ExecOpts{},
 					"cp", "-r", workdir, "/tmp/."); err != nil {
-					b.Fatal("failed to copy directory: %v %s", err, out)
+					b.Fatalf("failed to copy directory: %v (%s)", err, out)
 				}
 				workdir = "/tmp" + workdir
 			}
diff --git a/test/benchmarks/network/node_test.go b/test/benchmarks/network/node_test.go
index 52eb794c4..0f4a205b6 100644
--- a/test/benchmarks/network/node_test.go
+++ b/test/benchmarks/network/node_test.go
@@ -48,14 +48,14 @@ func runNode(b *testing.B, hey *tools.Hey) {
 	// The machine to hold Redis and the Node Server.
 	serverMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer serverMachine.CleanUp()
 
 	// The machine to run 'hey'.
 	clientMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer clientMachine.CleanUp()
 
diff --git a/test/benchmarks/network/ruby_test.go b/test/benchmarks/network/ruby_test.go
index 5e0b2b724..67f63f76a 100644
--- a/test/benchmarks/network/ruby_test.go
+++ b/test/benchmarks/network/ruby_test.go
@@ -47,14 +47,14 @@ func runRuby(b *testing.B, hey *tools.Hey) {
 	// The machine to hold Redis and the Ruby Server.
 	serverMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer serverMachine.CleanUp()
 
 	// The machine to run 'hey'.
 	clientMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer clientMachine.CleanUp()
 	ctx := context.Background()
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index 93a36c6c2..d72c63fe6 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -125,6 +125,7 @@ def packetimpact_go_test(name, size = "small", pure = True, expect_native_failur
         name = testbench_binary,
         size = size,
         pure = pure,
+        nogo = False,  # FIXME(gvisor.dev/issue/3374): Not working with all build systems.
         tags = [
             "local",
             "manual",
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
index df91fa0fe..11ac5cb52 100644
--- a/test/root/crictl_test.go
+++ b/test/root/crictl_test.go
@@ -418,7 +418,7 @@ func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) {
 		// care about the docker runtime name.
 		config = v2Template
 	default:
-		t.Fatalf("unknown version: %d", version)
+		t.Fatalf("unknown version: %s", version)
 	}
 	t.Logf("Using config: %s", config)
 
diff --git a/test/runtimes/proctor/BUILD b/test/runtimes/proctor/BUILD
index f76e2ddc0..d1935cbe8 100644
--- a/test/runtimes/proctor/BUILD
+++ b/test/runtimes/proctor/BUILD
@@ -21,6 +21,7 @@ go_test(
     size = "small",
     srcs = ["proctor_test.go"],
     library = ":proctor",
+    nogo = False,  # FIXME(gvisor.dev/issue/3374): Not working with all build systems.
     pure = True,
     deps = [
         "//pkg/test/testutil",
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index db7f379b8..4bbcda054 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -87,13 +87,14 @@ def cc_binary(name, static = False, **kwargs):
         **kwargs
     )
 
-def go_binary(name, static = False, pure = False, **kwargs):
+def go_binary(name, static = False, pure = False, x_defs = None, **kwargs):
     """Build a go binary.
 
     Args:
         name: name of the target.
         static: build a static binary.
         pure: build without cgo.
+        x_defs: additional definitions.
         **kwargs: rest of the arguments are passed to _go_binary.
     """
     if static:
@@ -102,6 +103,7 @@ def go_binary(name, static = False, pure = False, **kwargs):
         kwargs["pure"] = "on"
     _go_binary(
         name = name,
+        x_defs = x_defs,
         **kwargs
     )
 
@@ -151,6 +153,11 @@ def go_rule(rule, implementation, **kwargs):
     toolchains = kwargs.get("toolchains", []) + ["@io_bazel_rules_go//go:toolchain"]
     return rule(implementation, attrs = attrs, toolchains = toolchains, **kwargs)
 
+def go_test_library(target):
+    if hasattr(target.attr, "embed") and len(target.attr.embed) > 0:
+        return target.attr.embed[0]
+    return None
+
 def go_context(ctx):
     go_ctx = _go_context(ctx)
     return struct(
diff --git a/tools/defs.bzl b/tools/defs.bzl
index e71a26cf4..290d564f2 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -27,7 +27,6 @@ gbenchmark = _gbenchmark
 gazelle = _gazelle
 go_embed_data = _go_embed_data
 go_path = _go_path
-go_test = _go_test
 gtest = _gtest
 grpcpp = _grpcpp
 loopback = _loopback
@@ -45,17 +44,35 @@ vdso_linker_option = _vdso_linker_option
 default_platform = _default_platform
 platforms = _platforms
 
-def go_binary(name, **kwargs):
+def go_binary(name, nogo = True, pure = False, static = False, x_defs = None, **kwargs):
     """Wraps the standard go_binary.
 
     Args:
       name: the rule name.
+      nogo: enable nogo analysis.
+      pure: build a pure Go (no CGo) binary.
+      static: build a static binary.
+      x_defs: additional linker definitions.
       **kwargs: standard go_binary arguments.
     """
     _go_binary(
         name = name,
+        pure = pure,
+        static = static,
+        x_defs = x_defs,
         **kwargs
     )
+    if nogo:
+        # Note that the nogo rule applies only for go_library and go_test
+        # targets, therefore we construct a library from the binary sources.
+        _go_library(
+            name = name + "_nogo_library",
+            **kwargs
+        )
+        nogo_test(
+            name = name + "_nogo",
+            deps = [":" + name + "_nogo_library"],
+        )
 
 def calculate_sets(srcs):
     """Calculates special Go sets for templates.
@@ -119,6 +136,7 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
       stateify: whether statify is enabled (default: true).
       marshal: whether marshal is enabled (default: false).
       marshal_debug: whether the gomarshal tools emits debugging output (default: false).
+      nogo: enable nogo analysis.
       **kwargs: standard go_library arguments.
     """
     all_srcs = srcs
@@ -202,6 +220,24 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
                 **kwargs
             )
 
+def go_test(name, nogo = True, **kwargs):
+    """Wraps the standard go_test.
+
+    Args:
+      name: the rule name.
+      nogo: enable nogo analysis.
+      **kwargs: standard go_test arguments.
+    """
+    _go_test(
+        name = name,
+        **kwargs
+    )
+    if nogo:
+        nogo_test(
+            name = name + "_nogo",
+            deps = [":" + name],
+        )
+
 def proto_library(name, srcs, deps = None, has_services = 0, **kwargs):
     """Wraps the standard proto_library.
 
diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD
index 4ef1a3124..35b0111ca 100644
--- a/tools/issue_reviver/BUILD
+++ b/tools/issue_reviver/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_binary(
     name = "issue_reviver",
     srcs = ["main.go"],
+    nogo = False,
     deps = [
         "//tools/issue_reviver/github",
         "//tools/issue_reviver/reviver",
diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD
index 0eabc2835..555abd296 100644
--- a/tools/issue_reviver/github/BUILD
+++ b/tools/issue_reviver/github/BUILD
@@ -21,4 +21,5 @@ go_test(
     size = "small",
     srcs = ["github_test.go"],
     library = ":github",
+    nogo = False,
 )
diff --git a/tools/nogo/check/BUILD b/tools/nogo/check/BUILD
index e2d76cd5c..21ba2c306 100644
--- a/tools/nogo/check/BUILD
+++ b/tools/nogo/check/BUILD
@@ -7,6 +7,7 @@ package(licenses = ["notice"])
 go_binary(
     name = "check",
     srcs = ["main.go"],
+    nogo = False,
     visibility = ["//visibility:public"],
     deps = ["//tools/nogo"],
 )
diff --git a/tools/nogo/defs.bzl b/tools/nogo/defs.bzl
index d399079c5..5377620b0 100644
--- a/tools/nogo/defs.bzl
+++ b/tools/nogo/defs.bzl
@@ -1,6 +1,6 @@
 """Nogo rules."""
 
-load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule")
+load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule", "go_test_library")
 
 # NogoInfo is the serialized set of package facts for a nogo analysis.
 #
@@ -8,10 +8,13 @@ load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule")
 # with the source files as input. Note however, that the individual nogo rules
 # are simply stubs that enter into the shadow dependency tree (the "aspect").
 NogoInfo = provider(
+    "information for nogo analysis",
     fields = {
         "facts": "serialized package facts",
         "importpath": "package import path",
         "binaries": "package binary files",
+        "srcs": "original source files (for go_test support)",
+        "deps": "original deps (for go_test support)",
     },
 )
 
@@ -21,16 +24,29 @@ def _nogo_aspect_impl(target, ctx):
     # All work is done in the shadow properties for go rules. For a proto
     # library, we simply skip the analysis portion but still need to return a
     # valid NogoInfo to reference the generated binary.
-    if ctx.rule.kind == "go_library":
+    if ctx.rule.kind in ("go_library", "go_binary", "go_test", "go_tool_library"):
         srcs = ctx.rule.files.srcs
-    elif ctx.rule.kind == "go_proto_library" or ctx.rule.kind == "go_wrap_cc":
+        deps = ctx.rule.attr.deps
+    elif ctx.rule.kind in ("go_proto_library", "go_wrap_cc"):
         srcs = []
+        deps = ctx.rule.attr.deps
     else:
         return [NogoInfo()]
 
-    go_ctx = go_context(ctx)
+    # If we're using the "library" attribute, then we need to aggregate the
+    # original library sources and dependencies into this target to perform
+    # proper type analysis.
+    if ctx.rule.kind == "go_test":
+        library = go_test_library(ctx.rule)
+        if library != None:
+            info = library[NogoInfo]
+            if hasattr(info, "srcs"):
+                srcs = srcs + info.srcs
+            if hasattr(info, "deps"):
+                deps = deps + info.deps
 
     # Construct the Go environment from the go_ctx.env dictionary.
+    go_ctx = go_context(ctx)
     env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_ctx.env.items()])
 
     # Start with all target files and srcs as input.
@@ -41,6 +57,13 @@ def _nogo_aspect_impl(target, ctx):
     # to cleanly allow us redirect stdout to the actual output file. Perhaps
     # I'm missing something here, but the intermediate script does work.
     binaries = target.files.to_list()
+    objfiles = [f for f in binaries if f.path.endswith(".a")]
+    if len(objfiles) > 0:
+        # Prefer the .a files for go_library targets.
+        target_objfile = objfiles[0]
+    else:
+        # Use the raw binary for go_binary and go_test targets.
+        target_objfile = binaries[0]
     disasm_file = ctx.actions.declare_file(target.label.name + ".out")
     dumper = ctx.actions.declare_file("%s-dumper" % ctx.label.name)
     ctx.actions.write(dumper, "\n".join([
@@ -48,12 +71,12 @@ def _nogo_aspect_impl(target, ctx):
         "%s %s tool objdump %s > %s\n" % (
             env_prefix,
             go_ctx.go.path,
-            [f.path for f in binaries if f.path.endswith(".a")][0],
+            target_objfile.path,
             disasm_file.path,
         ),
     ]), is_executable = True)
     ctx.actions.run(
-        inputs = binaries,
+        inputs = [target_objfile],
         outputs = [disasm_file],
         tools = go_ctx.runfiles,
         mnemonic = "GoObjdump",
@@ -63,7 +86,15 @@ def _nogo_aspect_impl(target, ctx):
     inputs.append(disasm_file)
 
     # Extract the importpath for this package.
-    importpath = go_importpath(target)
+    if ctx.rule.kind == "go_test":
+        # If this is a test, then it will not be imported by anything else.
+        # We can safely set the importapth to just "test". Note that this
+        # is necessary if the library also imports the core library (in
+        # addition to including the sources directly), which happens in
+        # some complex cases (seccomp_victim).
+        importpath = "test"
+    else:
+        importpath = go_importpath(target)
 
     # The nogo tool requires a configfile serialized in JSON format to do its
     # work. This must line up with the nogo.Config fields.
@@ -84,7 +115,7 @@ def _nogo_aspect_impl(target, ctx):
     )
 
     # Collect all info from shadow dependencies.
-    for dep in ctx.rule.attr.deps:
+    for dep in deps:
         # There will be no file attribute set for all transitive dependencies
         # that are not go_library or go_binary rules, such as a proto rules.
         # This is handled by the ctx.rule.kind check above.
@@ -126,12 +157,18 @@ def _nogo_aspect_impl(target, ctx):
         facts = facts,
         importpath = importpath,
         binaries = binaries,
+        srcs = srcs,
+        deps = deps,
     )]
 
 nogo_aspect = go_rule(
     aspect,
     implementation = _nogo_aspect_impl,
-    attr_aspects = ["deps"],
+    attr_aspects = [
+        "deps",
+        "library",
+        "embed",
+    ],
     attrs = {
         "_nogo": attr.label(
             default = "//tools/nogo/check:check",
@@ -171,6 +208,10 @@ _nogo_test = rule(
     test = True,
 )
 
-def nogo_test(**kwargs):
+def nogo_test(name, **kwargs):
     tags = kwargs.pop("tags", []) + ["nogo"]
-    _nogo_test(tags = tags, **kwargs)
+    _nogo_test(
+        name = name,
+        tags = tags,
+        **kwargs
+    )
-- 
cgit v1.2.3


From 9e7a83e0f283965c6fbfa3d596421508dd088c58 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 25 Aug 2020 13:41:23 -0700
Subject: remove iptables sockopt special cases

iptables sockopts were kludged into an unnecessary check, this properly
relegates them to the {get,set}SockOptIP functions.

PiperOrigin-RevId: 328395135
---
 pkg/sentry/socket/netstack/netstack.go      | 137 ++++++++++++++--------------
 pkg/sentry/socket/netstack/netstack_vfs2.go |  68 +-------------
 pkg/sentry/socket/unix/unix.go              |   2 +-
 pkg/sentry/socket/unix/unix_vfs2.go         |   2 +-
 test/syscalls/linux/iptables.cc             |  37 +++++++-
 5 files changed, 104 insertions(+), 142 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 626195be2..9e2ebc7d4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -983,53 +983,12 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 		return &val, nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
-		switch name {
-		case linux.IPT_SO_GET_INFO:
-			if outLen < linux.SizeOfIPTGetinfo {
-				return nil, syserr.ErrInvalidArgument
-			}
-			if s.family != linux.AF_INET {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
-			if err != nil {
-				return nil, err
-			}
-			return &info, nil
-
-		case linux.IPT_SO_GET_ENTRIES:
-			if outLen < linux.SizeOfIPTGetEntries {
-				return nil, syserr.ErrInvalidArgument
-			}
-			if s.family != linux.AF_INET {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
-			if err != nil {
-				return nil, err
-			}
-			return &entries, nil
-
-		}
-	}
-
-	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
+	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
 }
 
 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
 // sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (marshal.Marshallable, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -1041,7 +1000,7 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in
 		return getSockOptIPv6(t, ep, name, outLen)
 
 	case linux.SOL_IP:
-		return getSockOptIP(t, ep, name, outLen, family)
+		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
 
 	case linux.SOL_UDP,
 		linux.SOL_ICMPV6,
@@ -1560,7 +1519,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha
 }
 
 // getSockOptIP implements GetSockOpt when level is SOL_IP.
-func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
+func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
 	switch name {
 	case linux.IP_TTL:
 		if outLen < sizeOfInt32 {
@@ -1676,6 +1635,46 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
 		return a.(*linux.SockAddrInet), nil
 
+	case linux.IPT_SO_GET_INFO:
+		if outLen < linux.SizeOfIPTGetinfo {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv4 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
+		if err != nil {
+			return nil, err
+		}
+		return &info, nil
+
+	case linux.IPT_SO_GET_ENTRIES:
+		if outLen < linux.SizeOfIPTGetEntries {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv4 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
+		if err != nil {
+			return nil, err
+		}
+		return &entries, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1709,29 +1708,6 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 		return nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.SOL_IP {
-		switch name {
-		case linux.IPT_SO_SET_REPLACE:
-			if len(optVal) < linux.SizeOfIPTReplace {
-				return syserr.ErrInvalidArgument
-			}
-			if s.family != linux.AF_INET {
-				return syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return syserr.ErrNoDevice
-			}
-			// Stack must be a netstack stack.
-			return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
-
-		case linux.IPT_SO_SET_ADD_COUNTERS:
-			// TODO(gvisor.dev/issue/170): Counter support.
-			return nil
-		}
-	}
-
 	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
 }
 
@@ -1749,7 +1725,7 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
 		return setSockOptIPv6(t, ep, name, optVal)
 
 	case linux.SOL_IP:
-		return setSockOptIP(t, ep, name, optVal)
+		return setSockOptIP(t, s, ep, name, optVal)
 
 	case linux.SOL_UDP,
 		linux.SOL_ICMPV6,
@@ -2160,7 +2136,7 @@ func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
 }
 
 // setSockOptIP implements SetSockOpt when level is SOL_IP.
-func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
 	switch name {
 	case linux.IP_MULTICAST_TTL:
 		v, err := parseIntOrChar(optVal)
@@ -2280,6 +2256,27 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0))
 
+	case linux.IPT_SO_SET_REPLACE:
+		if len(optVal) < linux.SizeOfIPTReplace {
+			return syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv4 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+			return syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return syserr.ErrNoDevice
+		}
+		// Stack must be a netstack stack.
+		return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
+
+	case linux.IPT_SO_SET_ADD_COUNTERS:
+		// TODO(gvisor.dev/issue/170): Counter support.
+		return nil
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index 1db8ae491..59fa4c58f 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -21,10 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
-	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -233,48 +231,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 		return &val, nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
-		switch name {
-		case linux.IPT_SO_GET_INFO:
-			if outLen < linux.SizeOfIPTGetinfo {
-				return nil, syserr.ErrInvalidArgument
-			}
-			if s.family != linux.AF_INET {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
-			if err != nil {
-				return nil, err
-			}
-			return &info, nil
-
-		case linux.IPT_SO_GET_ENTRIES:
-			if outLen < linux.SizeOfIPTGetEntries {
-				return nil, syserr.ErrInvalidArgument
-			}
-			if s.family != linux.AF_INET {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
-			if err != nil {
-				return nil, err
-			}
-			return &entries, nil
-
-		}
-	}
-
-	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
+	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
 }
 
 // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
@@ -304,29 +261,6 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
 		return nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.SOL_IP {
-		switch name {
-		case linux.IPT_SO_SET_REPLACE:
-			if len(optVal) < linux.SizeOfIPTReplace {
-				return syserr.ErrInvalidArgument
-			}
-			if s.family != linux.AF_INET {
-				return syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return syserr.ErrNoDevice
-			}
-			// Stack must be a netstack stack.
-			return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
-
-		case linux.IPT_SO_SET_ADD_COUNTERS:
-			// TODO(gvisor.dev/issue/170): Counter support.
-			return nil
-		}
-	}
-
 	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
 }
 
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index b7e8e4325..0a7a26495 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -194,7 +194,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
-	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
 }
 
 // Listen implements the linux syscall listen(2) for sockets backed by
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index d066ef8ab..65a285b8f 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -91,7 +91,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
-	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
 }
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
diff --git a/test/syscalls/linux/iptables.cc b/test/syscalls/linux/iptables.cc
index 9b338d970..f1af8f097 100644
--- a/test/syscalls/linux/iptables.cc
+++ b/test/syscalls/linux/iptables.cc
@@ -67,12 +67,43 @@ TEST(IPTablesBasic, FailSockoptNonRaw) {
   struct ipt_getinfo info = {};
   snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
   socklen_t info_size = sizeof(info);
-  EXPECT_THAT(getsockopt(sock, IPPROTO_IP, IPT_SO_GET_INFO, &info, &info_size),
+  EXPECT_THAT(getsockopt(sock, SOL_IP, IPT_SO_GET_INFO, &info, &info_size),
               SyscallFailsWithErrno(ENOPROTOOPT));
 
   ASSERT_THAT(close(sock), SyscallSucceeds());
 }
 
+TEST(IPTablesBasic, GetInfoErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info) - 1;
+  ASSERT_THAT(getsockopt(sock, SOL_IP, IPT_SO_GET_INFO, &info, &info_size),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(IPTablesBasic, GetEntriesErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ipt_get_entries entries = {};
+  socklen_t entries_size = sizeof(struct ipt_get_entries) - 1;
+  snprintf(entries.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  ASSERT_THAT(
+      getsockopt(sock, SOL_IP, IPT_SO_GET_ENTRIES, &entries, &entries_size),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 // Fixture for iptables tests.
 class IPTablesTest : public ::testing::Test {
  protected:
@@ -112,7 +143,7 @@ TEST_F(IPTablesTest, InitialState) {
   struct ipt_getinfo info = {};
   snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
   socklen_t info_size = sizeof(info);
-  ASSERT_THAT(getsockopt(s_, IPPROTO_IP, IPT_SO_GET_INFO, &info, &info_size),
+  ASSERT_THAT(getsockopt(s_, SOL_IP, IPT_SO_GET_INFO, &info, &info_size),
               SyscallSucceeds());
 
   // The nat table supports PREROUTING, and OUTPUT.
@@ -148,7 +179,7 @@ TEST_F(IPTablesTest, InitialState) {
   snprintf(entries->name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
   entries->size = info.size;
   ASSERT_THAT(
-      getsockopt(s_, IPPROTO_IP, IPT_SO_GET_ENTRIES, entries, &entries_size),
+      getsockopt(s_, SOL_IP, IPT_SO_GET_ENTRIES, entries, &entries_size),
       SyscallSucceeds());
 
   // Verify the name and size.
-- 
cgit v1.2.3


From 09bd5a57f3456bb411c34cced923531dc8e0aec7 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 25 Aug 2020 14:45:03 -0700
Subject: Clarify comment on NetworkProtocolNumber.

The actual values used for this field in Netstack are actually EtherType values
of the protocol in an Ethernet frame. Eg. header.IPv4ProtocolNumber is 0x0800
and not the number of the IPv4 Protocol Number itself which is 4. Similarly
header.IPv6ProtocolNumber is set to 0x86DD whereas the IPv6 protocol number is
41.

See:
  - https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml (For EtherType)
  - https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml (For ProtocolNumbers)
PiperOrigin-RevId: 328407293
---
 pkg/tcpip/tcpip.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 44f87e007..609b8af33 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1029,7 +1029,10 @@ func (r Route) String() string {
 // TransportProtocolNumber is the number of a transport protocol.
 type TransportProtocolNumber uint32
 
-// NetworkProtocolNumber is the number of a network protocol.
+// NetworkProtocolNumber is the EtherType of a network protocol in an Ethernet
+// frame.
+//
+// See: https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml
 type NetworkProtocolNumber uint32
 
 // A StatCounter keeps track of a statistic.
-- 
cgit v1.2.3


From 7483666eca67bf7d7ee814a4c8667af575f15bda Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Tue, 25 Aug 2020 14:58:28 -0700
Subject: overlay: clonePrivateMount must pass a Dentry reference to
 MakeVirtualDentry.

PiperOrigin-RevId: 328410065
---
 pkg/sentry/fsimpl/overlay/overlay.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 4b3dfbc01..00562667f 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -315,7 +315,11 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc
 	if err != nil {
 		return vfs.VirtualDentry{}, err
 	}
-	return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil
+	// Take a reference on the dentry which will be owned by the returned
+	// VirtualDentry.
+	d := vd.Dentry()
+	d.IncRef()
+	return vfs.MakeVirtualDentry(newmnt, d), nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
-- 
cgit v1.2.3


From e91164893d6bbaf42639b7e4bb948e9165587130 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 25 Aug 2020 15:26:54 -0700
Subject: [go-marshal] Enable auto-marshalling for host tty.

PiperOrigin-RevId: 328415633
---
 pkg/abi/linux/tty.go          |  4 ++++
 pkg/sentry/fs/host/BUILD      |  1 +
 pkg/sentry/fs/host/tty.go     | 40 +++++++++++++++-------------------------
 pkg/sentry/fsimpl/host/BUILD  |  1 +
 pkg/sentry/fsimpl/host/tty.go | 40 +++++++++++++++-------------------------
 5 files changed, 36 insertions(+), 50 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go
index 8ac02aee8..e640969a6 100644
--- a/pkg/abi/linux/tty.go
+++ b/pkg/abi/linux/tty.go
@@ -23,6 +23,8 @@ const (
 )
 
 // Winsize is struct winsize, defined in uapi/asm-generic/termios.h.
+//
+// +marshal
 type Winsize struct {
 	Row    uint16
 	Col    uint16
@@ -31,6 +33,8 @@ type Winsize struct {
 }
 
 // Termios is struct termios, defined in uapi/asm-generic/termbits.h.
+//
+// +marshal
 type Termios struct {
 	InputFlags        uint32
 	OutputFlags       uint32
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index d41d23a43..42a6c41c2 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -55,6 +55,7 @@ go_library(
         "//pkg/unet",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/primitive",
     ],
 )
 
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index e29ae00f2..67a807f9d 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -24,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // LINT.IfChange
@@ -123,6 +124,11 @@ func (t *TTYFileOperations) Release(ctx context.Context) {
 
 // Ioctl implements fs.FileOperations.Ioctl.
 func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		return 0, syserror.ENOTTY
+	}
+
 	// Ignore arg[0].  This is the real FD:
 	fd := t.fileOperations.iops.fileState.FD()
 	ioctl := args[1].Uint64()
@@ -132,9 +138,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = termios.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -146,9 +150,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		}
 
 		var termios linux.Termios
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetTermios(fd, ioctl, &termios)
@@ -173,10 +175,8 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 
 		// Map the ProcessGroup into a ProcessGroupID in the task's PID
 		// namespace.
-		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+		_, err := pgID.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSPGRP:
@@ -184,11 +184,6 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		// Equivalent to tcsetpgrp(fd, *argp).
 		// Set the foreground process group ID of this terminal.
 
-		task := kernel.TaskFromContext(ctx)
-		if task == nil {
-			return 0, syserror.ENOTTY
-		}
-
 		t.mu.Lock()
 		defer t.mu.Unlock()
 
@@ -208,12 +203,11 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 			return 0, syserror.ENOTTY
 		}
 
-		var pgID kernel.ProcessGroupID
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		var pgIDP primitive.Int32
+		if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
+		pgID := kernel.ProcessGroupID(pgIDP)
 
 		// pgID must be non-negative.
 		if pgID < 0 {
@@ -242,9 +236,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = winsize.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSWINSZ:
@@ -255,9 +247,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		// background ones) can set the winsize.
 
 		var winsize linux.Winsize
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetWinsize(fd, &winsize)
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 090ae0804..be1c88c82 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -72,6 +72,7 @@ go_library(
         "//pkg/unet",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/primitive",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 27cbd3059..7a9be4b97 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // TTYFileDescription implements vfs.FileDescriptionImpl for a host file
@@ -143,6 +144,11 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence,
 
 // Ioctl implements vfs.FileDescriptionImpl.
 func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		return 0, syserror.ENOTTY
+	}
+
 	// Ignore arg[0]. This is the real FD:
 	fd := t.inode.hostFD
 	ioctl := args[1].Uint64()
@@ -152,9 +158,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = termios.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -166,9 +170,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		}
 
 		var termios linux.Termios
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetTermios(fd, ioctl, &termios)
@@ -192,10 +194,8 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		defer t.mu.Unlock()
 
 		// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
-		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+		_, err := pgID.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSPGRP:
@@ -203,11 +203,6 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		// Equivalent to tcsetpgrp(fd, *argp).
 		// Set the foreground process group ID of this terminal.
 
-		task := kernel.TaskFromContext(ctx)
-		if task == nil {
-			return 0, syserror.ENOTTY
-		}
-
 		t.mu.Lock()
 		defer t.mu.Unlock()
 
@@ -226,12 +221,11 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 			return 0, syserror.ENOTTY
 		}
 
-		var pgID kernel.ProcessGroupID
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		var pgIDP primitive.Int32
+		if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
+		pgID := kernel.ProcessGroupID(pgIDP)
 
 		// pgID must be non-negative.
 		if pgID < 0 {
@@ -260,9 +254,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = winsize.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSWINSZ:
@@ -273,9 +265,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		// set the winsize.
 
 		var winsize linux.Winsize
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetWinsize(fd, &winsize)
-- 
cgit v1.2.3


From c8125fe386f7b835e04a0ea40a2b501ef61598e5 Mon Sep 17 00:00:00 2001
From: Toshi Kikuchi <toshik@google.com>
Date: Tue, 25 Aug 2020 16:13:39 -0700
Subject: Only send an ICMP error message if UDP checksum is valid.

Test:
 - TestV4UnknownDestination
 - TestV6UnknownDestination
PiperOrigin-RevId: 328424137
---
 pkg/tcpip/transport/udp/endpoint.go |  37 ++++----
 pkg/tcpip/transport/udp/protocol.go |   7 +-
 pkg/tcpip/transport/udp/udp_test.go | 162 ++++++++++++++++++------------------
 3 files changed, 106 insertions(+), 100 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index c33434b75..0a9d3c6cf 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1366,6 +1366,22 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return result
 }
 
+// verifyChecksum verifies the checksum unless RX checksum offload is enabled.
+// On IPv4, UDP checksum is optional, and a zero value means the transmitter
+// omitted the checksum generation (RFC768).
+// On IPv6, UDP checksum is not optional (RFC2460 Section 8.1).
+func verifyChecksum(r *stack.Route, hdr header.UDP, pkt *stack.PacketBuffer) bool {
+	if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 &&
+		(hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) {
+		xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length())
+		for _, v := range pkt.Data.Views() {
+			xsum = header.Checksum(v, xsum)
+		}
+		return hdr.CalculateChecksum(xsum) == 0xffff
+	}
+	return true
+}
+
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
@@ -1387,22 +1403,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		return
 	}
 
-	// Verify checksum unless RX checksum offload is enabled.
-	// On IPv4, UDP checksum is optional, and a zero value means
-	// the transmitter omitted the checksum generation (RFC768).
-	// On IPv6, UDP checksum is not optional (RFC2460 Section 8.1).
-	if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 &&
-		(hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) {
-		xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length())
-		for _, v := range pkt.Data.Views() {
-			xsum = header.Checksum(v, xsum)
-		}
-		if hdr.CalculateChecksum(xsum) != 0xffff {
-			// Checksum Error.
-			e.stack.Stats().UDP.ChecksumErrors.Increment()
-			e.stats.ReceiveErrors.ChecksumErrors.Increment()
-			return
-		}
+	if !verifyChecksum(r, hdr, pkt) {
+		// Checksum Error.
+		e.stack.Stats().UDP.ChecksumErrors.Increment()
+		e.stats.ReceiveErrors.ChecksumErrors.Increment()
+		return
 	}
 
 	e.stack.Stats().UDP.PacketsReceived.Increment()
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 63d4bed7c..f65751dd4 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -88,7 +88,12 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
 	}
-	// TODO(b/129426613): only send an ICMP message if UDP checksum is valid.
+
+	if !verifyChecksum(r, hdr, pkt) {
+		// Checksum Error.
+		r.Stack().Stats().UDP.ChecksumErrors.Increment()
+		return true
+	}
 
 	// Only send ICMP error if the address is not a multicast/broadcast
 	// v4/v6 address or the source is not the unspecified address.
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 0a558df6d..bd1c8ac31 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -403,18 +403,35 @@ func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.Netw
 }
 
 // injectPacket creates a packet of the given flow and with the given payload,
-// and injects it into the link endpoint.
-func (c *testContext) injectPacket(flow testFlow, payload []byte) {
+// and injects it into the link endpoint. If badChecksum is true, the packet has
+// a bad checksum in the UDP header.
+func (c *testContext) injectPacket(flow testFlow, payload []byte, badChecksum bool) {
 	c.t.Helper()
 
 	h := flow.header4Tuple(incoming)
 	if flow.isV4() {
 		buf := c.buildV4Packet(payload, &h)
+		if badChecksum {
+			// Invalidate the UDP header checksum field, taking care to avoid
+			// overflow to zero, which would disable checksum validation.
+			for u := header.UDP(buf[header.IPv4MinimumSize:]); ; {
+				u.SetChecksum(u.Checksum() + 1)
+				if u.Checksum() != 0 {
+					break
+				}
+			}
+		}
 		c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: buf.ToVectorisedView(),
 		}))
 	} else {
 		buf := c.buildV6Packet(payload, &h)
+		if badChecksum {
+			// Invalidate the UDP header checksum field (Unlike IPv4, zero is
+			// a valid checksum value for IPv6 so no need to avoid it).
+			u := header.UDP(buf[header.IPv6MinimumSize:])
+			u.SetChecksum(u.Checksum() + 1)
+		}
 		c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: buf.ToVectorisedView(),
 		}))
@@ -569,7 +586,7 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	c.t.Helper()
 
 	payload := newPayload()
-	c.injectPacket(flow, payload)
+	c.injectPacket(flow, payload, false)
 
 	// Try to receive the data.
 	we, ch := waiter.NewChannelEntry(nil)
@@ -925,7 +942,7 @@ func TestReadFromMulticastStats(t *testing.T) {
 			}
 
 			payload := newPayload()
-			c.injectPacket(flow, payload)
+			c.injectPacket(flow, payload, false)
 
 			var want uint64 = 0
 			if flow.isReverseMulticast() {
@@ -1727,21 +1744,33 @@ func TestV4UnknownDestination(t *testing.T) {
 		// so that the final generated IPv4 packet is larger than
 		// header.IPv4MinimumProcessableDatagramSize.
 		largePayload bool
+		// badChecksum if true, will set an invalid checksum in the
+		// header.
+		badChecksum bool
 	}{
-		{unicastV4, true, false},
-		{unicastV4, true, true},
-		{multicastV4, false, false},
-		{multicastV4, false, true},
-		{broadcast, false, false},
-		{broadcast, false, true},
-	}
+		{unicastV4, true, false, false},
+		{unicastV4, true, true, false},
+		{unicastV4, false, false, true},
+		{unicastV4, false, true, true},
+		{multicastV4, false, false, false},
+		{multicastV4, false, true, false},
+		{broadcast, false, false, false},
+		{broadcast, false, true, false},
+	}
+	checksumErrors := uint64(0)
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) {
+		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t badChecksum:%t", tc.flow, tc.icmpRequired, tc.largePayload, tc.badChecksum), func(t *testing.T) {
 			payload := newPayload()
 			if tc.largePayload {
 				payload = newMinPayload(576)
 			}
-			c.injectPacket(tc.flow, payload)
+			c.injectPacket(tc.flow, payload, tc.badChecksum)
+			if tc.badChecksum {
+				checksumErrors++
+				if got, want := c.s.Stats().UDP.ChecksumErrors.Value(), checksumErrors; got != want {
+					t.Fatalf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+				}
+			}
 			if !tc.icmpRequired {
 				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 				defer cancel()
@@ -1806,19 +1835,31 @@ func TestV6UnknownDestination(t *testing.T) {
 		// largePayload if true will result in a payload large enough to
 		// create an IPv6 packet > header.IPv6MinimumMTU bytes.
 		largePayload bool
+		// badChecksum if true, will set an invalid checksum in the
+		// header.
+		badChecksum bool
 	}{
-		{unicastV6, true, false},
-		{unicastV6, true, true},
-		{multicastV6, false, false},
-		{multicastV6, false, true},
-	}
+		{unicastV6, true, false, false},
+		{unicastV6, true, true, false},
+		{unicastV6, false, false, true},
+		{unicastV6, false, true, true},
+		{multicastV6, false, false, false},
+		{multicastV6, false, true, false},
+	}
+	checksumErrors := uint64(0)
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) {
+		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t badChecksum:%t", tc.flow, tc.icmpRequired, tc.largePayload, tc.badChecksum), func(t *testing.T) {
 			payload := newPayload()
 			if tc.largePayload {
 				payload = newMinPayload(1280)
 			}
-			c.injectPacket(tc.flow, payload)
+			c.injectPacket(tc.flow, payload, tc.badChecksum)
+			if tc.badChecksum {
+				checksumErrors++
+				if got, want := c.s.Stats().UDP.ChecksumErrors.Value(), checksumErrors; got != want {
+					t.Fatalf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+				}
+			}
 			if !tc.icmpRequired {
 				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 				defer cancel()
@@ -1953,74 +1994,29 @@ func TestShortHeader(t *testing.T) {
 	}
 }
 
-// TestIncrementChecksumErrorsV4 verifies if a checksum error is detected,
+// TestBadChecksumErrors verifies if a checksum error is detected,
 // global and endpoint stats are incremented.
-func TestIncrementChecksumErrorsV4(t *testing.T) {
-	c := newDualTestContext(t, defaultMTU)
-	defer c.cleanup()
-
-	c.createEndpoint(ipv4.ProtocolNumber)
-	// Bind to wildcard.
-	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-		c.t.Fatalf("Bind failed: %s", err)
-	}
-
-	payload := newPayload()
-	h := unicastV4.header4Tuple(incoming)
-	buf := c.buildV4Packet(payload, &h)
+func TestBadChecksumErrors(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, unicastV6} {
+		c := newDualTestContext(t, defaultMTU)
+		defer c.cleanup()
 
-	// Invalidate the UDP header checksum field, taking care to avoid
-	// overflow to zero, which would disable checksum validation.
-	for u := header.UDP(buf[header.IPv4MinimumSize:]); ; {
-		u.SetChecksum(u.Checksum() + 1)
-		if u.Checksum() != 0 {
-			break
+		c.createEndpoint(flow.sockProto())
+		// Bind to wildcard.
+		if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+			c.t.Fatalf("Bind failed: %s", err)
 		}
-	}
-
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	const want = 1
-	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
-		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
-	}
-	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
-		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
-	}
-}
-
-// TestIncrementChecksumErrorsV6 verifies if a checksum error is detected,
-// global and endpoint stats are incremented.
-func TestIncrementChecksumErrorsV6(t *testing.T) {
-	c := newDualTestContext(t, defaultMTU)
-	defer c.cleanup()
-
-	c.createEndpoint(ipv6.ProtocolNumber)
-	// Bind to wildcard.
-	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-		c.t.Fatalf("Bind failed: %s", err)
-	}
-
-	payload := newPayload()
-	h := unicastV6.header4Tuple(incoming)
-	buf := c.buildV6Packet(payload, &h)
-
-	// Invalidate the UDP header checksum field.
-	u := header.UDP(buf[header.IPv6MinimumSize:])
-	u.SetChecksum(u.Checksum() + 1)
 
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
+		payload := newPayload()
+		c.injectPacket(flow, payload, true /* badChecksum */)
 
-	const want = 1
-	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
-		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
-	}
-	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
-		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+		const want = 1
+		if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+			t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+		}
+		if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+			t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 5683a8568adc9c13c1cf9d360dae105dc60b145d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 25 Aug 2020 16:26:53 -0700
Subject: Expose basic coverage information to userspace through kcov
 interface.

In Linux, a kernel configuration is set that compiles the kernel with a
custom function that is called at the beginning of every basic block, which
updates the memory-mapped coverage information. The Go coverage tool does not
allow us to inject arbitrary instructions into basic blocks, but it does
provide data that we can convert to a kcov-like format and transfer them to
userspace through a memory mapping.

Note that this is not a strict implementation of kcov, which is especially
tricky to do because we do not have the same coverage tools available in Go
that that are available for the actual Linux kernel. In Linux, a kernel
configuration is set that compiles the kernel with a custom function that is
called at the beginning of every basic block to write program counters to the
kcov memory mapping. In Go, however, coverage tools only give us a count of
basic blocks as they are executed. Every time we return to userspace, we
collect the coverage information and write out PCs for each block that was
executed, providing userspace with the illusion that the kcov data is always
up to date. For convenience, we also generate a unique synthetic PC for each
block instead of using actual PCs. Finally, we do not provide thread-specific
coverage data (each kcov instance only contains PCs executed by the thread
owning it); instead, we will supply data for any file specified by --
instrumentation_filter.

Also, fix issue in nogo that was causing pkg/coverage:coverage_nogo
compilation to fail.

PiperOrigin-RevId: 328426526
---
 pkg/abi/linux/ioctl.go           |  21 +++
 pkg/coverage/BUILD               |  14 ++
 pkg/coverage/coverage.go         | 175 +++++++++++++++++++++
 pkg/sentry/fsimpl/sys/BUILD      |   5 +
 pkg/sentry/fsimpl/sys/kcov.go    | 116 ++++++++++++++
 pkg/sentry/fsimpl/sys/sys.go     |  18 ++-
 pkg/sentry/kernel/BUILD          |   3 +
 pkg/sentry/kernel/kcov.go        | 321 +++++++++++++++++++++++++++++++++++++++
 pkg/sentry/kernel/kcov_unsafe.go |  28 ++++
 pkg/sentry/kernel/kernel.go      |   2 +-
 pkg/sentry/kernel/task.go        |  18 +++
 pkg/sentry/kernel/task_exit.go   |   2 +
 runsc/config/config.go           |   6 +-
 test/syscalls/BUILD              |   4 +
 test/syscalls/linux/BUILD        |  14 ++
 test/syscalls/linux/kcov.cc      |  70 +++++++++
 16 files changed, 813 insertions(+), 4 deletions(-)
 create mode 100644 pkg/coverage/BUILD
 create mode 100644 pkg/coverage/coverage.go
 create mode 100644 pkg/sentry/fsimpl/sys/kcov.go
 create mode 100644 pkg/sentry/kernel/kcov.go
 create mode 100644 pkg/sentry/kernel/kcov_unsafe.go
 create mode 100644 test/syscalls/linux/kcov.cc

(limited to 'pkg')

diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 2c5e56ae5..d6dbedc3e 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -117,3 +117,24 @@ const (
 func IOC(dir, typ, nr, size uint32) uint32 {
 	return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT
 }
+
+// Kcov ioctls from kernel/kcov.h.
+var (
+	KCOV_INIT_TRACE = IOC(_IOC_READ, 'c', 1, 8)
+	KCOV_ENABLE     = IOC(_IOC_NONE, 'c', 100, 0)
+	KCOV_DISABLE    = IOC(_IOC_NONE, 'c', 101, 0)
+)
+
+// Kcov trace types from kernel/kcov.h.
+const (
+	KCOV_TRACE_PC  = 0
+	KCOV_TRACE_CMP = 1
+)
+
+// Kcov state constants from kernel/kcov.h.
+const (
+	KCOV_MODE_DISABLED  = 0
+	KCOV_MODE_INIT      = 1
+	KCOV_MODE_TRACE_PC  = 2
+	KCOV_MODE_TRACE_CMP = 3
+)
diff --git a/pkg/coverage/BUILD b/pkg/coverage/BUILD
new file mode 100644
index 000000000..a198e8028
--- /dev/null
+++ b/pkg/coverage/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "coverage",
+    srcs = ["coverage.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/usermem",
+        "@io_bazel_rules_go//go/tools/coverdata",
+    ],
+)
diff --git a/pkg/coverage/coverage.go b/pkg/coverage/coverage.go
new file mode 100644
index 000000000..6831adcce
--- /dev/null
+++ b/pkg/coverage/coverage.go
@@ -0,0 +1,175 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package coverage provides an interface through which Go coverage data can
+// be collected, converted to kcov format, and exposed to userspace.
+//
+// Coverage can be enabled by calling bazel {build,test} with
+// --collect_coverage_data and --instrumentation_filter with the desired
+// coverage surface. This causes bazel to use the Go cover tool manually to
+// generate instrumented files. It injects a hook that registers all coverage
+// data with the coverdata package.
+package coverage
+
+import (
+	"fmt"
+	"io"
+	"sort"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+
+	"github.com/bazelbuild/rules_go/go/tools/coverdata"
+)
+
+// KcovAvailable returns whether the kcov coverage interface is available. It is
+// available as long as coverage is enabled for some files.
+func KcovAvailable() bool {
+	return len(coverdata.Cover.Blocks) > 0
+}
+
+// coverageMu must be held while accessing coverdata.Cover. This prevents
+// concurrent reads/writes from multiple threads collecting coverage data.
+var coverageMu sync.RWMutex
+
+// once ensures that globalData is only initialized once.
+var once sync.Once
+
+var globalData struct {
+	// files is the set of covered files sorted by filename. It is calculated at
+	// startup.
+	files []string
+
+	// syntheticPCs are a set of PCs calculated at startup, where the PC
+	// at syntheticPCs[i][j] corresponds to file i, block j.
+	syntheticPCs [][]uint64
+}
+
+// ClearCoverageData clears existing coverage data.
+func ClearCoverageData() {
+	coverageMu.Lock()
+	defer coverageMu.Unlock()
+	for _, counters := range coverdata.Cover.Counters {
+		for index := 0; index < len(counters); index++ {
+			atomic.StoreUint32(&counters[index], 0)
+		}
+	}
+}
+
+var coveragePool = sync.Pool{
+	New: func() interface{} {
+		return make([]byte, 0)
+	},
+}
+
+// ConsumeCoverageData builds and writes the collection of covered PCs. It
+// returns the number of bytes written.
+//
+// In Linux, a kernel configuration is set that compiles the kernel with a
+// custom function that is called at the beginning of every basic block, which
+// updates the memory-mapped coverage information. The Go coverage tool does not
+// allow us to inject arbitrary instructions into basic blocks, but it does
+// provide data that we can convert to a kcov-like format and transfer them to
+// userspace through a memory mapping.
+//
+// Note that this is not a strict implementation of kcov, which is especially
+// tricky to do because we do not have the same coverage tools available in Go
+// that that are available for the actual Linux kernel. In Linux, a kernel
+// configuration is set that compiles the kernel with a custom function that is
+// called at the beginning of every basic block to write program counters to the
+// kcov memory mapping. In Go, however, coverage tools only give us a count of
+// basic blocks as they are executed. Every time we return to userspace, we
+// collect the coverage information and write out PCs for each block that was
+// executed, providing userspace with the illusion that the kcov data is always
+// up to date. For convenience, we also generate a unique synthetic PC for each
+// block instead of using actual PCs. Finally, we do not provide thread-specific
+// coverage data (each kcov instance only contains PCs executed by the thread
+// owning it); instead, we will supply data for any file specified by --
+// instrumentation_filter.
+//
+// Note that we "consume", i.e. clear, coverdata when this function is run, to
+// ensure that each event is only reported once.
+//
+// TODO(b/160639712): evaluate whether it is ok to reset the global coverage
+// data every time this function is run. We could technically have each thread
+// store a local snapshot against which we compare the most recent coverdata so
+// that separate threads do not affect each other's view of the data.
+func ConsumeCoverageData(w io.Writer) int {
+	once.Do(initCoverageData)
+
+	coverageMu.Lock()
+	defer coverageMu.Unlock()
+
+	total := 0
+	var pcBuffer [8]byte
+	for fileIndex, file := range globalData.files {
+		counters := coverdata.Cover.Counters[file]
+		for index := 0; index < len(counters); index++ {
+			val := atomic.SwapUint32(&counters[index], 0)
+			if val != 0 {
+				// Calculate the synthetic PC.
+				pc := globalData.syntheticPCs[fileIndex][index]
+
+				usermem.ByteOrder.PutUint64(pcBuffer[:], pc)
+				n, err := w.Write(pcBuffer[:])
+				if err != nil {
+					if err == io.EOF {
+						// Simply stop writing if we encounter EOF; it's ok if we attempted to
+						// write more than we can hold.
+						return total + n
+					}
+					panic(fmt.Sprintf("Internal error writing PCs to kcov area: %v", err))
+				}
+				total += n
+			}
+		}
+	}
+
+	if total == 0 {
+		// An empty profile indicates that coverage is not enabled, in which case
+		// there shouldn't be any task work registered.
+		panic("kcov task work is registered, but no coverage data was found")
+	}
+	return total
+}
+
+// initCoverageData initializes globalData. It should only be called once,
+// before any kcov data is written.
+func initCoverageData() {
+	// First, order all files. Then calculate synthetic PCs for every block
+	// (using the well-defined ordering for files as well).
+	for file := range coverdata.Cover.Blocks {
+		globalData.files = append(globalData.files, file)
+	}
+	sort.Strings(globalData.files)
+
+	// nextSyntheticPC is the first PC that we generate for a block.
+	//
+	// This uses a standard-looking kernel range for simplicity.
+	//
+	// FIXME(b/160639712): This is only necessary because syzkaller requires
+	// addresses in the kernel range. If we can remove this constraint, then we
+	// should be able to use the actual addresses.
+	var nextSyntheticPC uint64 = 0xffffffff80000000
+	for _, file := range globalData.files {
+		blocks := coverdata.Cover.Blocks[file]
+		thisFile := make([]uint64, 0, len(blocks))
+		for range blocks {
+			thisFile = append(thisFile, nextSyntheticPC)
+			nextSyntheticPC++ // Advance.
+		}
+		globalData.syntheticPCs = append(globalData.syntheticPCs, thisFile)
+	}
+}
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 1b548ccd4..f9b232da6 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -5,17 +5,22 @@ licenses(["notice"])
 go_library(
     name = "sys",
     srcs = [
+        "kcov.go",
         "sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/coverage",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
new file mode 100644
index 000000000..92710d877
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -0,0 +1,116 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) *kernfs.Dentry {
+	k := &kcovInode{}
+	k.InodeAttrs.Init(creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600)
+	d := &kernfs.Dentry{}
+	d.Init(k)
+	return d
+}
+
+// kcovInode implements kernfs.Inode.
+type kcovInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotSymlink
+	kernfs.InodeNotDirectory
+}
+
+func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		panic("KernelFromContext returned nil")
+	}
+	fd := &kcovFD{
+		inode: i,
+		kcov:  k.NewKcov(),
+	}
+
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:  true,
+		DenyPWrite: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+type kcovFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.NoLockFD
+
+	vfsfd vfs.FileDescription
+	inode *kcovInode
+	kcov  *kernel.Kcov
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	cmd := uint32(args[1].Int())
+	arg := args[2].Uint64()
+	switch uint32(cmd) {
+	case linux.KCOV_INIT_TRACE:
+		return 0, fd.kcov.InitTrace(arg)
+	case linux.KCOV_ENABLE:
+		return 0, fd.kcov.EnableTrace(ctx, uint8(arg))
+	case linux.KCOV_DISABLE:
+		if arg != 0 {
+			// This arg is unused; it should be 0.
+			return 0, syserror.EINVAL
+		}
+		return 0, fd.kcov.DisableTrace(ctx)
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// ConfigureMmap implements vfs.FileDescriptionImpl.ConfigureMmap.
+func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.kcov.ConfigureMMap(ctx, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *kcovFD) Release(ctx context.Context) {
+	// kcov instances have reference counts in Linux, but this seems sufficient
+	// for our purposes.
+	fd.kcov.Reset()
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts)
+}
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 393feb802..1f042d9f7 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/coverage"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -73,7 +74,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}),
 		"firmware": fs.newDir(creds, defaultSysDirMode, nil),
 		"fs":       fs.newDir(creds, defaultSysDirMode, nil),
-		"kernel":   fs.newDir(creds, defaultSysDirMode, nil),
+		"kernel":   kernelDir(ctx, fs, creds),
 		"module":   fs.newDir(creds, defaultSysDirMode, nil),
 		"power":    fs.newDir(creds, defaultSysDirMode, nil),
 	})
@@ -94,6 +95,21 @@ func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernf
 	return fs.newDir(creds, defaultSysDirMode, children)
 }
 
+func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+	// If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs
+	// should be mounted at debug/, but for our purposes, it is sufficient to
+	// keep it in sys.
+	var children map[string]*kernfs.Dentry
+	if coverage.KcovAvailable() {
+		children = map[string]*kernfs.Dentry{
+			"debug": fs.newDir(creds, linux.FileMode(0700), map[string]*kernfs.Dentry{
+				"kcov": fs.newKcovFile(ctx, creds),
+			}),
+		}
+	}
+	return fs.newDir(creds, defaultSysDirMode, children)
+}
+
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release(ctx context.Context) {
 	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 5416a310d..d1ecceba3 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -91,6 +91,8 @@ go_library(
         "fd_table_unsafe.go",
         "fs_context.go",
         "ipc_namespace.go",
+        "kcov.go",
+        "kcov_unsafe.go",
         "kernel.go",
         "kernel_opts.go",
         "kernel_state.go",
@@ -157,6 +159,7 @@ go_library(
         "//pkg/bits",
         "//pkg/bpf",
         "//pkg/context",
+        "//pkg/coverage",
         "//pkg/cpuid",
         "//pkg/eventchannel",
         "//pkg/fspath",
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
new file mode 100644
index 000000000..aad63aa99
--- /dev/null
+++ b/pkg/sentry/kernel/kcov.go
@@ -0,0 +1,321 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+	"io"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/coverage"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
+// area. On Linux, the maximum is INT_MAX / 8.
+const kcovAreaSizeMax = 10 * 1024 * 1024
+
+// Kcov provides kernel coverage data to userspace through a memory-mapped
+// region, as kcov does in Linux.
+//
+// To give the illusion that the data is always up to date, we update the shared
+// memory every time before we return to userspace.
+type Kcov struct {
+	// mfp provides application memory. It is immutable after creation.
+	mfp pgalloc.MemoryFileProvider
+
+	// mu protects all of the fields below.
+	mu sync.RWMutex
+
+	// mode is the current kcov mode.
+	mode uint8
+
+	// size is the size of the mapping through which the kernel conveys coverage
+	// information to userspace.
+	size uint64
+
+	// owningTask is the task that currently owns coverage data on the system. The
+	// interface for kcov essentially requires that coverage is only going to a
+	// single task. Note that kcov should only generate coverage data for the
+	// owning task, but we currently generate global coverage.
+	owningTask *Task
+
+	// count is a locally cached version of the first uint64 in the kcov data,
+	// which is the number of subsequent entries representing PCs.
+	//
+	// It is used with kcovInode.countBlock(), to copy in/out the first element of
+	// the actual data in an efficient manner, avoid boilerplate, and prevent
+	// accidental garbage escapes by the temporary counts.
+	count uint64
+
+	mappable *mm.SpecialMappable
+}
+
+// NewKcov creates and returns a Kcov instance.
+func (k *Kernel) NewKcov() *Kcov {
+	return &Kcov{
+		mfp: k,
+	}
+}
+
+var coveragePool = sync.Pool{
+	New: func() interface{} {
+		return make([]byte, 0)
+	},
+}
+
+// TaskWork implements TaskWorker.TaskWork.
+func (kcov *Kcov) TaskWork(t *Task) {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	rw := &kcovReadWriter{
+		mf: kcov.mfp.MemoryFile(),
+		fr: kcov.mappable.FileRange(),
+	}
+
+	// Read in the PC count.
+	if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil {
+		panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err))
+	}
+
+	rw.off = 8 * (1 + kcov.count)
+	n := coverage.ConsumeCoverageData(&kcovIOWriter{rw})
+
+	// Update the pc count, based on the number of entries written. Note that if
+	// we reached the end of the kcov area, we may not have written everything in
+	// output.
+	kcov.count += uint64(n / 8)
+	rw.off = 0
+	if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil {
+		panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err))
+	}
+
+	// Re-register for future work.
+	t.RegisterWork(kcov)
+}
+
+// InitTrace performs the KCOV_INIT_TRACE ioctl.
+func (kcov *Kcov) InitTrace(size uint64) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_DISABLED {
+		return syserror.EBUSY
+	}
+
+	// To simplify all the logic around mapping, we require that the length of the
+	// shared region is a multiple of the system page size.
+	if (8*size)&(usermem.PageSize-1) != 0 {
+		return syserror.EINVAL
+	}
+
+	// We need space for at least two uint64s to hold current position and a
+	// single PC.
+	if size < 2 || size > kcovAreaSizeMax {
+		return syserror.EINVAL
+	}
+
+	kcov.size = size
+	kcov.mode = linux.KCOV_MODE_INIT
+	return nil
+}
+
+// EnableTrace performs the KCOV_ENABLE_TRACE ioctl.
+func (kcov *Kcov) EnableTrace(ctx context.Context, traceMode uint8) error {
+	t := TaskFromContext(ctx)
+	if t == nil {
+		panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+	}
+
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	// KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call.
+	if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil {
+		return syserror.EINVAL
+	}
+
+	switch traceMode {
+	case linux.KCOV_TRACE_PC:
+		kcov.mode = traceMode
+	case linux.KCOV_TRACE_CMP:
+		// We do not support KCOV_MODE_TRACE_CMP.
+		return syserror.ENOTSUP
+	default:
+		return syserror.EINVAL
+	}
+
+	if kcov.owningTask != nil && kcov.owningTask != t {
+		return syserror.EBUSY
+	}
+
+	kcov.owningTask = t
+	t.RegisterWork(kcov)
+
+	// Clear existing coverage data; the task expects to read only coverage data
+	// from the time it is activated.
+	coverage.ClearCoverageData()
+	return nil
+}
+
+// DisableTrace performs the KCOV_DISABLE_TRACE ioctl.
+func (kcov *Kcov) DisableTrace(ctx context.Context) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	t := TaskFromContext(ctx)
+	if t == nil {
+		panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+	}
+
+	if t != kcov.owningTask {
+		return syserror.EINVAL
+	}
+	kcov.owningTask = nil
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.resetLocked()
+	return nil
+}
+
+// Reset is called when the owning task exits.
+func (kcov *Kcov) Reset() {
+	kcov.mu.Lock()
+	kcov.resetLocked()
+	kcov.mu.Unlock()
+}
+
+// The kcov instance is reset when the owning task exits or when tracing is
+// disabled.
+func (kcov *Kcov) resetLocked() {
+	kcov.owningTask = nil
+	if kcov.mappable != nil {
+		kcov.mappable = nil
+	}
+}
+
+// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to
+// implement vfs.FileDescription.ConfigureMMap.
+func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_INIT {
+		return syserror.EINVAL
+	}
+
+	if kcov.mappable == nil {
+		// Set up the kcov area.
+		fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous)
+		if err != nil {
+			return err
+		}
+
+		// Get the thread id for the mmap name.
+		t := TaskFromContext(ctx)
+		if t == nil {
+			panic("ThreadFromContext returned nil")
+		}
+		// For convenience, a special mappable is used here. Note that these mappings
+		// will look different under /proc/[pid]/maps than they do on Linux.
+		kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr)
+	}
+	opts.Mappable = kcov.mappable
+	opts.MappingIdentity = kcov.mappable
+	return nil
+}
+
+// kcovReadWriter implements safemem.Reader and safemem.Writer.
+type kcovReadWriter struct {
+	off uint64
+	mf  *pgalloc.MemoryFile
+	fr  memmap.FileRange
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the read to the kcov range and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if rend := start + dsts.NumBytes(); rend < end {
+		end = rend
+	}
+
+	// Get internal mappings.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy from internal mappings.
+	n, err := safemem.CopySeq(dsts, bs)
+	rw.off += n
+	return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the write to the kcov area and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if wend := start + srcs.NumBytes(); wend < end {
+		end = wend
+	}
+
+	// Get internal mapping.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy to internal mapping.
+	n, err := safemem.CopySeq(bs, srcs)
+	rw.off += n
+	return n, err
+}
+
+// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter.
+type kcovIOWriter struct {
+	rw *kcovReadWriter
+}
+
+// Write implements io.Writer.Write.
+func (w *kcovIOWriter) Write(p []byte) (int, error) {
+	bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+	n, err := safemem.WriteFullFromBlocks(w.rw, bs)
+	return int(n), err
+}
diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go
new file mode 100644
index 000000000..6f64022eb
--- /dev/null
+++ b/pkg/sentry/kernel/kcov_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// countBlock provides a safemem.BlockSeq for k.count.
+//
+// Like k.count, the block returned is protected by k.mu.
+func (k *Kcov) countBlock() safemem.BlockSeq {
+	return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&k.count), int(unsafe.Sizeof(k.count))))
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 2e0175e36..402aa1718 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -248,7 +248,7 @@ type Kernel struct {
 	// SpecialOpts contains special kernel options.
 	SpecialOpts
 
-	// VFS keeps the filesystem state used across the kernel.
+	// vfs keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
 
 	// hostMount is the Mount used for file descriptors that were imported
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 5aee699e7..a436610c9 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -574,6 +574,11 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
+
+	// kcov is the kcov instance providing code coverage owned by this task.
+	//
+	// kcov is exclusive to the task goroutine.
+	kcov *Kcov
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -903,3 +908,16 @@ func (t *Task) UID() uint32 {
 func (t *Task) GID() uint32 {
 	return uint32(t.Credentials().EffectiveKGID)
 }
+
+// SetKcov sets the kcov instance associated with t.
+func (t *Task) SetKcov(k *Kcov) {
+	t.kcov = k
+}
+
+// ResetKcov clears the kcov instance associated with t.
+func (t *Task) ResetKcov() {
+	if t.kcov != nil {
+		t.kcov.Reset()
+		t.kcov = nil
+	}
+}
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index c165d6cb1..b76f7f503 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	t.traceExitEvent()
 	lastExiter := t.exitThreadGroup()
 
+	t.ResetKcov()
+
 	// If the task has a cleartid, and the thread group wasn't killed by a
 	// signal, handle that before releasing the MM.
 	if t.cleartid != 0 {
diff --git a/runsc/config/config.go b/runsc/config/config.go
index ca85cef51..8cf0378d5 100644
--- a/runsc/config/config.go
+++ b/runsc/config/config.go
@@ -300,10 +300,10 @@ type Config struct {
 	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
 	CPUNumFromQuota bool
 
-	// Enables VFS2 (not plumbled through yet).
+	// Enables VFS2 (not plumbed through yet).
 	VFS2 bool
 
-	// Enables FUSE usage (not plumbled through yet).
+	// Enables FUSE usage (not plumbed through yet).
 	FUSE bool
 
 	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
@@ -353,6 +353,8 @@ func (c *Config) ToFlags() []string {
 		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
 		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
 		"--qdisc=" + c.QDisc.String(),
+		"--vfs2=" + strconv.FormatBool(c.VFS2),
+		"--fuse=" + strconv.FormatBool(c.FUSE),
 	}
 	if c.CPUNumFromQuota {
 		f = append(f, "--cpu-num-from-quota")
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index eea1401ac..65e8299c3 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -262,6 +262,10 @@ syscall_test(
     test = "//test/syscalls/linux:itimer_test",
 )
 
+syscall_test(
+    test = "//test/syscalls/linux:kcov_test",
+)
+
 syscall_test(
     test = "//test/syscalls/linux:kill_test",
 )
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ed0b6ecf4..5a323d331 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1067,6 +1067,20 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "kcov_test",
+    testonly = 1,
+    srcs = ["kcov.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_binary(
     name = "kill_test",
     testonly = 1,
diff --git a/test/syscalls/linux/kcov.cc b/test/syscalls/linux/kcov.cc
new file mode 100644
index 000000000..f3c30444e
--- /dev/null
+++ b/test/syscalls/linux/kcov.cc
@@ -0,0 +1,70 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/errno.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// For this test to work properly, it must be run with coverage enabled. On
+// native Linux, this involves compiling the kernel with kcov enabled. For
+// gVisor, we need to enable the Go coverage tool, e.g.
+// bazel test --collect_coverage_data --instrumentation_filter=//pkg/... <test>.
+TEST(KcovTest, Kcov) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE))));
+
+  constexpr int kSize = 4096;
+  constexpr int KCOV_INIT_TRACE = 0x80086301;
+  constexpr int KCOV_ENABLE = 0x6364;
+
+  int fd;
+  ASSERT_THAT(fd = open("/sys/kernel/debug/kcov", O_RDWR),
+              AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ENOENT)));
+
+  // Kcov not enabled.
+  SKIP_IF(errno == ENOENT);
+
+  ASSERT_THAT(ioctl(fd, KCOV_INIT_TRACE, kSize), SyscallSucceeds());
+  uint64_t* area = (uint64_t*)mmap(nullptr, kSize * sizeof(uint64_t),
+                                   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+  ASSERT_TRUE(area != MAP_FAILED);
+  ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallSucceeds());
+
+  for (int i = 0; i < 10; i++) {
+    // Make some syscalls to generate coverage data.
+    ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallFailsWithErrno(EINVAL));
+  }
+
+  uint64_t num_pcs = *(uint64_t*)(area);
+  EXPECT_GT(num_pcs, 0);
+  for (uint64_t i = 1; i <= num_pcs; i++) {
+    // Verify that PCs are in the standard kernel range.
+    EXPECT_GT(area[i], 0xffffffff7fffffffL);
+  }
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From e382f99346f54122276a38561f42556b600a9454 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 25 Aug 2020 16:38:07 -0700
Subject: Return non-zero size for tmpfs statfs(2).

This does not implement accepting or enforcing any size limit, which will be
more complex and has performance implications; it just returns a fixed non-zero
size.

Updates #1936

PiperOrigin-RevId: 328428588
---
 pkg/abi/linux/fs.go                   | 11 ++++++-----
 pkg/sentry/fs/tmpfs/tmpfs.go          | 12 ++++++++++--
 pkg/sentry/fsimpl/tmpfs/filesystem.go | 12 +-----------
 pkg/sentry/fsimpl/tmpfs/tmpfs.go      | 24 ++++++++++++++++++++++++
 4 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index 158d2db5b..2b1ef0d4e 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -44,17 +44,18 @@ type Statfs struct {
 	// Type is one of the filesystem magic values, defined above.
 	Type uint64
 
-	// BlockSize is the data block size.
+	// BlockSize is the optimal transfer block size in bytes.
 	BlockSize int64
 
-	// Blocks is the number of data blocks in use.
+	// Blocks is the maximum number of data blocks the filesystem may store, in
+	// units of BlockSize.
 	Blocks uint64
 
-	// BlocksFree is the number of free blocks.
+	// BlocksFree is the number of free data blocks, in units of BlockSize.
 	BlocksFree uint64
 
-	// BlocksAvailable is the number of blocks free for use by
-	// unprivileged users.
+	// BlocksAvailable is the number of data blocks free for use by
+	// unprivileged users, in units of BlockSize.
 	BlocksAvailable uint64
 
 	// Files is the number of used file nodes on the filesystem.
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index b095312fe..998b697ca 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -16,6 +16,8 @@
 package tmpfs
 
 import (
+	"math"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -32,9 +34,15 @@ import (
 var fsInfo = fs.Info{
 	Type: linux.TMPFS_MAGIC,
 
+	// tmpfs currently does not support configurable size limits. In Linux,
+	// such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+	// statfs(2). However, many applications treat this as having a size limit
+	// of 0. To work around this, claim to have a very large but non-zero size,
+	// chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+	// applications may also handle incorrectly).
 	// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
-	TotalBlocks: 0,
-	FreeBlocks:  0,
+	TotalBlocks: math.MaxInt64 / usermem.PageSize,
+	FreeBlocks:  math.MaxInt64 / usermem.PageSize,
 }
 
 // rename implements fs.InodeOperations.Rename for tmpfs nodes.
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 7924a0911..eddfeab76 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -25,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Sync implements vfs.FilesystemImpl.Sync.
@@ -706,16 +705,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if _, err := resolveLocked(ctx, rp); err != nil {
 		return linux.Statfs{}, err
 	}
-	statfs := linux.Statfs{
-		Type:         linux.TMPFS_MAGIC,
-		BlockSize:    usermem.PageSize,
-		FragmentSize: usermem.PageSize,
-		NameLength:   linux.NAME_MAX,
-		// TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
-		Blocks:     0,
-		BlocksFree: 0,
-	}
-	return statfs, nil
+	return globalStatfs, nil
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 428f62aaa..a7fdf19ca 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -201,6 +201,25 @@ func (fs *filesystem) Release(ctx context.Context) {
 	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 }
 
+// immutable
+var globalStatfs = linux.Statfs{
+	Type:         linux.TMPFS_MAGIC,
+	BlockSize:    usermem.PageSize,
+	FragmentSize: usermem.PageSize,
+	NameLength:   linux.NAME_MAX,
+
+	// tmpfs currently does not support configurable size limits. In Linux,
+	// such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+	// statfs(2). However, many applications treat this as having a size limit
+	// of 0. To work around this, claim to have a very large but non-zero size,
+	// chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+	// applications may also handle incorrectly).
+	// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
+	Blocks:          math.MaxInt64 / usermem.PageSize,
+	BlocksFree:      math.MaxInt64 / usermem.PageSize,
+	BlocksAvailable: math.MaxInt64 / usermem.PageSize,
+}
+
 // dentry implements vfs.DentryImpl.
 type dentry struct {
 	vfsd vfs.Dentry
@@ -698,6 +717,11 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return nil
 }
 
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return globalStatfs, nil
+}
+
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
 func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
 	return fd.inode().listxattr(size)
-- 
cgit v1.2.3


From 87e03869065f0784bf9ed76855205693128f65a4 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 25 Aug 2020 21:01:45 -0700
Subject: Use new reference count utility throughout gvisor.

This uses the refs_vfs2 template in vfs2 as well as objects common to vfs1 and
vfs2. Note that vfs1-only refcounts are not replaced, since vfs1 will be deleted
soon anyway.

The following structs now use the new tool, with leak check enabled:
devpts:rootInode
fuse:inode
kernfs:Dentry
kernfs:dir
kernfs:readonlyDir
kernfs:StaticDirectory
proc:fdDirInode
proc:fdInfoDirInode
proc:subtasksInode
proc:taskInode
proc:tasksInode
vfs:FileDescription
vfs:MountNamespace
vfs:Filesystem
sys:dir
kernel:FSContext
kernel:ProcessGroup
kernel:Session
shm:Shm
mm:aioMappable
mm:SpecialMappable
transport:queue

And the following use the template, but because they currently are not leak
checked, a TODO is left instead of enabling leak check in this patch:
kernel:FDTable
tun:tunEndpoint

Updates #1486.

PiperOrigin-RevId: 328460377
---
 pkg/refs_vfs2/BUILD                                |  2 +-
 pkg/refs_vfs2/refs_template.go                     | 17 ++++-
 pkg/sentry/fsimpl/devpts/BUILD                     | 15 ++++
 pkg/sentry/fsimpl/devpts/devpts.go                 |  7 ++
 pkg/sentry/fsimpl/fuse/BUILD                       | 13 ++++
 pkg/sentry/fsimpl/fuse/fusefs.go                   |  7 ++
 pkg/sentry/fsimpl/kernfs/BUILD                     | 54 ++++++++++++-
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go        | 27 ++++---
 pkg/sentry/fsimpl/kernfs/kernfs.go                 | 24 +++---
 pkg/sentry/fsimpl/kernfs/kernfs_test.go            | 12 +++
 pkg/sentry/fsimpl/proc/BUILD                       | 61 +++++++++++++++
 pkg/sentry/fsimpl/proc/subtasks.go                 |  7 ++
 pkg/sentry/fsimpl/proc/task.go                     |  8 ++
 pkg/sentry/fsimpl/proc/task_fds.go                 | 16 +++-
 pkg/sentry/fsimpl/proc/task_net.go                 |  6 +-
 pkg/sentry/fsimpl/proc/tasks.go                    |  7 ++
 pkg/sentry/fsimpl/sys/BUILD                        | 15 ++++
 pkg/sentry/fsimpl/sys/sys.go                       |  9 ++-
 pkg/sentry/kernel/BUILD                            | 48 ++++++++++++
 pkg/sentry/kernel/fd_table.go                      | 21 +++--
 pkg/sentry/kernel/fd_table_unsafe.go               |  2 +
 pkg/sentry/kernel/fs_context.go                    | 89 ++++++++++++----------
 pkg/sentry/kernel/sessions.go                      | 29 +++----
 pkg/sentry/kernel/shm/BUILD                        | 13 ++++
 pkg/sentry/kernel/shm/shm.go                       | 19 ++---
 pkg/sentry/mm/BUILD                                | 24 ++++++
 pkg/sentry/mm/aio_context.go                       |  7 +-
 pkg/sentry/mm/special_mappable.go                  |  7 +-
 pkg/sentry/socket/unix/transport/BUILD             | 12 +++
 pkg/sentry/socket/unix/transport/connectioned.go   |  8 +-
 pkg/sentry/socket/unix/transport/connectionless.go |  2 +-
 pkg/sentry/socket/unix/transport/queue.go          | 13 ++--
 pkg/sentry/vfs/BUILD                               | 37 +++++++++
 pkg/sentry/vfs/README.md                           |  9 ---
 pkg/sentry/vfs/file_description.go                 | 39 +---------
 pkg/sentry/vfs/filesystem.go                       | 37 +--------
 pkg/sentry/vfs/mount.go                            | 21 ++---
 pkg/tcpip/link/tun/BUILD                           | 14 ++++
 pkg/tcpip/link/tun/device.go                       |  9 +--
 39 files changed, 531 insertions(+), 236 deletions(-)

(limited to 'pkg')

diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD
index 7b3e10683..577b827a5 100644
--- a/pkg/refs_vfs2/BUILD
+++ b/pkg/refs_vfs2/BUILD
@@ -11,7 +11,7 @@ go_template(
     types = [
         "T",
     ],
-    visibility = ["//pkg/sentry:internal"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
         "//pkg/refs",
diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refs_vfs2/refs_template.go
index 99c43c065..d9b552896 100644
--- a/pkg/refs_vfs2/refs_template.go
+++ b/pkg/refs_vfs2/refs_template.go
@@ -12,11 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package refs_template defines a template that can be used by reference counted
-// objects.
+// Package refs_template defines a template that can be used by reference
+// counted objects. The "owner" template parameter is used in log messages to
+// indicate the type of reference-counted object that exhibited a reference
+// leak. As a result, structs that are embedded in other structs should not use
+// this template, since it will make tracking down leaks more difficult.
 package refs_template
 
 import (
+	"fmt"
 	"runtime"
 	"sync/atomic"
 
@@ -38,6 +42,11 @@ var ownerType *T
 // Note that the number of references is actually refCount + 1 so that a default
 // zero-value Refs object contains one reference.
 //
+// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in
+// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount.
+// This will allow us to add stack trace information to the leak messages
+// without growing the size of Refs.
+//
 // +stateify savable
 type Refs struct {
 	// refCount is composed of two fields:
@@ -82,7 +91,7 @@ func (r *Refs) ReadRefs() int64 {
 //go:nosplit
 func (r *Refs) IncRef() {
 	if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
-		panic("Incrementing non-positive ref count")
+		panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ownerType))
 	}
 }
 
@@ -122,7 +131,7 @@ func (r *Refs) TryIncRef() bool {
 func (r *Refs) DecRef(destroy func()) {
 	switch v := atomic.AddInt64(&r.refCount, -1); {
 	case v < -1:
-		panic("Decrementing non-positive ref count")
+		panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ownerType))
 
 	case v == -1:
 		// Call the destructor.
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 93512c9b6..3f64fab3a 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -1,7 +1,19 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "root_inode_refs",
+    out = "root_inode_refs.go",
+    package = "devpts",
+    prefix = "rootInode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "rootInode",
+    },
+)
+
 go_library(
     name = "devpts",
     srcs = [
@@ -9,6 +21,7 @@ go_library(
         "line_discipline.go",
         "master.go",
         "queue.go",
+        "root_inode_refs.go",
         "slave.go",
         "terminal.go",
     ],
@@ -16,6 +29,8 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
+        "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 3f3a099bd..0eaff9087 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -83,6 +83,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
 	}
 	root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
 	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	root.EnableLeakCheck()
 	root.dentry.Init(root)
 
 	// Construct the pts master inode and dentry. Linux always uses inode
@@ -110,6 +111,7 @@ func (fs *filesystem) Release(ctx context.Context) {
 
 // rootInode is the root directory inode for the devpts mounts.
 type rootInode struct {
+	rootInodeRefs
 	kernfs.AlwaysValid
 	kernfs.InodeAttrs
 	kernfs.InodeDirectoryNoNewChildren
@@ -233,3 +235,8 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
 	}
 	return offset, nil
 }
+
+// DecRef implements kernfs.Inode.
+func (i *rootInode) DecRef(context.Context) {
+	i.rootInodeRefs.DecRef(i.Destroy)
+}
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
index 999111deb..53a4f3012 100644
--- a/pkg/sentry/fsimpl/fuse/BUILD
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "inode_refs",
+    out = "inode_refs.go",
+    package = "fuse",
+    prefix = "inode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "inode",
+    },
+)
+
 go_library(
     name = "fuse",
     srcs = [
@@ -22,6 +33,7 @@ go_library(
         "dev.go",
         "fusefs.go",
         "init.go",
+        "inode_refs.go",
         "register.go",
         "request_list.go",
     ],
@@ -30,6 +42,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/log",
+        "//pkg/refs",
         "//pkg/sentry/fsimpl/devtmpfs",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 44021ee4b..9717c0e15 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -198,6 +198,7 @@ func (fs *filesystem) Release(ctx context.Context) {
 
 // inode implements kernfs.Inode.
 type inode struct {
+	inodeRefs
 	kernfs.InodeAttrs
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
@@ -213,6 +214,7 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke
 	i := &inode{}
 	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
 	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	i.EnableLeakCheck()
 	i.dentry.Init(i)
 
 	return &i.dentry
@@ -324,3 +326,8 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio
 
 	return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
 }
+
+// DecRef implements kernfs.Inode.
+func (i *inode) DecRef(context.Context) {
+	i.inodeRefs.DecRef(i.Destroy)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 3835557fe..637dca70c 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -26,9 +26,54 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "dentry_refs",
+    out = "dentry_refs.go",
+    package = "kernfs",
+    prefix = "Dentry",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "Dentry",
+    },
+)
+
+go_template_instance(
+    name = "static_directory_refs",
+    out = "static_directory_refs.go",
+    package = "kernfs",
+    prefix = "StaticDirectory",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "StaticDirectory",
+    },
+)
+
+go_template_instance(
+    name = "dir_refs",
+    out = "dir_refs.go",
+    package = "kernfs_test",
+    prefix = "dir",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "dir",
+    },
+)
+
+go_template_instance(
+    name = "readonly_dir_refs",
+    out = "readonly_dir_refs.go",
+    package = "kernfs_test",
+    prefix = "readonlyDir",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "readonlyDir",
+    },
+)
+
 go_library(
     name = "kernfs",
     srcs = [
+        "dentry_refs.go",
         "dynamic_bytes_file.go",
         "fd_impl_util.go",
         "filesystem.go",
@@ -36,6 +81,7 @@ go_library(
         "inode_impl_util.go",
         "kernfs.go",
         "slot_list.go",
+        "static_directory_refs.go",
         "symlink.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -59,11 +105,17 @@ go_library(
 go_test(
     name = "kernfs_test",
     size = "small",
-    srcs = ["kernfs_test.go"],
+    srcs = [
+        "dir_refs.go",
+        "kernfs_test.go",
+        "readonly_dir_refs.go",
+    ],
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
+        "//pkg/refs",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 885856868..f442a5606 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -20,7 +20,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -344,8 +343,6 @@ type OrderedChildrenOptions struct {
 //
 // Must be initialize with Init before first use.
 type OrderedChildren struct {
-	refs.AtomicRefCount
-
 	// Can children be modified by user syscalls? It set to false, interface
 	// methods that would modify the children return EPERM. Immutable.
 	writable bool
@@ -361,14 +358,14 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
 	o.set = make(map[string]*slot)
 }
 
-// DecRef implements Inode.DecRef.
-func (o *OrderedChildren) DecRef(ctx context.Context) {
-	o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
-		o.mu.Lock()
-		defer o.mu.Unlock()
-		o.order.Reset()
-		o.set = nil
-	})
+// Destroy clears the children stored in o. It should be called by structs
+// embedding OrderedChildren upon destruction, i.e. when their reference count
+// reaches zero.
+func (o *OrderedChildren) Destroy() {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	o.order.Reset()
+	o.set = nil
 }
 
 // Populate inserts children into this OrderedChildren, and d's dentry
@@ -549,6 +546,7 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
 //
 // +stateify savable
 type StaticDirectory struct {
+	StaticDirectoryRefs
 	InodeNotSymlink
 	InodeDirectoryNoNewChildren
 	InodeAttrs
@@ -594,11 +592,16 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd
 	return fd.VFSFileDescription(), nil
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
+// DecRef implements kernfs.Inode.
+func (s *StaticDirectory) DecRef(context.Context) {
+	s.StaticDirectoryRefs.DecRef(s.Destroy)
+}
+
 // AlwaysValid partially implements kernfs.inodeDynamicLookup.
 type AlwaysValid struct{}
 
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 51dbc050c..ca3685800 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -57,7 +57,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -161,9 +160,9 @@ const (
 //
 // Must be initialized by Init prior to first use.
 type Dentry struct {
-	vfsd vfs.Dentry
+	DentryRefs
 
-	refs.AtomicRefCount
+	vfsd vfs.Dentry
 
 	// flags caches useful information about the dentry from the inode. See the
 	// dflags* consts above. Must be accessed by atomic ops.
@@ -194,6 +193,7 @@ func (d *Dentry) Init(inode Inode) {
 	if ftype == linux.ModeSymlink {
 		d.flags |= dflagsIsSymlink
 	}
+	d.EnableLeakCheck()
 }
 
 // VFSDentry returns the generic vfs dentry for this kernfs dentry.
@@ -213,16 +213,14 @@ func (d *Dentry) isSymlink() bool {
 
 // DecRef implements vfs.DentryImpl.DecRef.
 func (d *Dentry) DecRef(ctx context.Context) {
-	d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy)
-}
-
-// Precondition: Dentry must be removed from VFS' dentry cache.
-func (d *Dentry) destroy(ctx context.Context) {
-	d.inode.DecRef(ctx) // IncRef from Init.
-	d.inode = nil
-	if d.parent != nil {
-		d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
-	}
+	// Before the destructor is called, Dentry must be removed from VFS' dentry cache.
+	d.DentryRefs.DecRef(func() {
+		d.inode.DecRef(ctx) // IncRef from Init.
+		d.inode = nil
+		if d.parent != nil {
+			d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
+		}
+	})
 }
 
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index e5c28c0e4..e376d1736 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -96,6 +96,7 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S
 }
 
 type readonlyDir struct {
+	readonlyDirRefs
 	attrs
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
@@ -111,6 +112,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 	dir := &readonlyDir{}
 	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	dir.EnableLeakCheck()
 	dir.dentry.Init(dir)
 
 	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
@@ -128,7 +130,12 @@ func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs
 	return fd.VFSFileDescription(), nil
 }
 
+func (d *readonlyDir) DecRef(context.Context) {
+	d.readonlyDirRefs.DecRef(d.Destroy)
+}
+
 type dir struct {
+	dirRefs
 	attrs
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
@@ -145,6 +152,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	dir.fs = fs
 	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+	dir.EnableLeakCheck()
 	dir.dentry.Init(dir)
 
 	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
@@ -162,6 +170,10 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry,
 	return fd.VFSFileDescription(), nil
 }
 
+func (d *dir) DecRef(context.Context) {
+	d.dirRefs.DecRef(d.Destroy)
+}
+
 func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
 	creds := auth.CredentialsFromContext(ctx)
 	dir := d.fs.newDir(creds, opts.Mode, nil)
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 14ecfd300..a45b44440 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,18 +1,79 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "fd_dir_inode_refs",
+    out = "fd_dir_inode_refs.go",
+    package = "proc",
+    prefix = "fdDirInode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "fdDirInode",
+    },
+)
+
+go_template_instance(
+    name = "fd_info_dir_inode_refs",
+    out = "fd_info_dir_inode_refs.go",
+    package = "proc",
+    prefix = "fdInfoDirInode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "fdInfoDirInode",
+    },
+)
+
+go_template_instance(
+    name = "subtasks_inode_refs",
+    out = "subtasks_inode_refs.go",
+    package = "proc",
+    prefix = "subtasksInode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "subtasksInode",
+    },
+)
+
+go_template_instance(
+    name = "task_inode_refs",
+    out = "task_inode_refs.go",
+    package = "proc",
+    prefix = "taskInode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "taskInode",
+    },
+)
+
+go_template_instance(
+    name = "tasks_inode_refs",
+    out = "tasks_inode_refs.go",
+    package = "proc",
+    prefix = "tasksInode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "tasksInode",
+    },
+)
+
 go_library(
     name = "proc",
     srcs = [
+        "fd_dir_inode_refs.go",
+        "fd_info_dir_inode_refs.go",
         "filesystem.go",
         "subtasks.go",
+        "subtasks_inode_refs.go",
         "task.go",
         "task_fds.go",
         "task_files.go",
+        "task_inode_refs.go",
         "task_net.go",
         "tasks.go",
         "tasks_files.go",
+        "tasks_inode_refs.go",
         "tasks_sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index f25747da3..01c0efb3a 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -31,6 +31,7 @@ import (
 //
 // +stateify savable
 type subtasksInode struct {
+	subtasksInodeRefs
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
@@ -57,6 +58,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
 	// Note: credentials are overridden by taskOwnedInode.
 	subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
 	subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	subInode.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: subInode, owner: task}
 	dentry := &kernfs.Dentry{}
@@ -182,3 +184,8 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs
 func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
+
+// DecRef implements kernfs.Inode.
+func (i *subtasksInode) DecRef(context.Context) {
+	i.subtasksInodeRefs.DecRef(i.Destroy)
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 109b31b4c..66b557abd 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -32,6 +32,7 @@ import (
 //
 // +stateify savable
 type taskInode struct {
+	taskInodeRefs
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeNoDynamicLookup
@@ -84,6 +85,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace
 	taskInode := &taskInode{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
 	taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	taskInode.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: taskInode, owner: task}
 	dentry := &kernfs.Dentry{}
@@ -119,6 +121,11 @@ func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, v
 	return syserror.EPERM
 }
 
+// DecRef implements kernfs.Inode.
+func (i *taskInode) DecRef(context.Context) {
+	i.taskInodeRefs.DecRef(i.Destroy)
+}
+
 // taskOwnedInode implements kernfs.Inode and overrides inode owner with task
 // effective user and group.
 type taskOwnedInode struct {
@@ -147,6 +154,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
 	dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{
 		SeekEnd: kernfs.SeekEndZero,
 	})
+	dir.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: dir, owner: task}
 	d := &kernfs.Dentry{}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index e8fcb9aa1..0527b2de8 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -22,7 +22,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -101,6 +100,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off
 //
 // +stateify savable
 type fdDirInode struct {
+	fdDirInodeRefs
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
@@ -120,6 +120,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
 		},
 	}
 	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 
 	dentry := &kernfs.Dentry{}
 	dentry.Init(inode)
@@ -175,6 +176,11 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
 	return err
 }
 
+// DecRef implements kernfs.Inode.
+func (i *fdDirInode) DecRef(context.Context) {
+	i.fdDirInodeRefs.DecRef(i.Destroy)
+}
+
 // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
 //
 // +stateify savable
@@ -227,6 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
 //
 // +stateify savable
 type fdInfoDirInode struct {
+	fdInfoDirInodeRefs
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
@@ -245,6 +252,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
 		},
 	}
 	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 
 	dentry := &kernfs.Dentry{}
 	dentry.Init(inode)
@@ -282,12 +290,16 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
 	return fd.VFSFileDescription(), nil
 }
 
+// DecRef implements kernfs.Inode.
+func (i *fdInfoDirInode) DecRef(context.Context) {
+	i.fdInfoDirInodeRefs.DecRef(i.Destroy)
+}
+
 // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
 //
 // +stateify savable
 type fdInfoData struct {
 	kernfs.DynamicBytesFile
-	refs.AtomicRefCount
 
 	task *kernel.Task
 	fd   int32
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index a4c884bf9..4e69782c7 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// For now, we always redact this pointer.
 		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
 			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			s.Refs()-1,                    // RefCount, don't count our own ref.
+			s.ReadRefs()-1,                // RefCount, don't count our own ref.
 			0,                             // Protocol, always 0 for UDS.
 			sockFlags,                     // Flags.
 			sops.Endpoint().Type(),        // Type.
@@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 
 		// Field: refcount. Don't count the ref we obtain while deferencing
 		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+		fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 		// Field: ref; reference count on the socket inode. Don't count the ref
 		// we obtain while deferencing the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+		fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 1391992b7..863c4467e 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -37,6 +37,7 @@ const (
 //
 // +stateify savable
 type tasksInode struct {
+	tasksInodeRefs
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
@@ -84,6 +85,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace
 		cgroupControllers: cgroupControllers,
 	}
 	inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 
 	dentry := &kernfs.Dentry{}
 	dentry.Init(inode)
@@ -226,6 +228,11 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St
 	return stat, nil
 }
 
+// DecRef implements kernfs.Inode.
+func (i *tasksInode) DecRef(context.Context) {
+	i.tasksInodeRefs.DecRef(i.Destroy)
+}
+
 // staticFileSetStat implements a special static file that allows inode
 // attributes to be set. This is to support /proc files that are readonly, but
 // allow attributes to be set.
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index f9b232da6..906cd52cb 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -1,10 +1,23 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "dir_refs",
+    out = "dir_refs.go",
+    package = "sys",
+    prefix = "dir",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "dir",
+    },
+)
+
 go_library(
     name = "sys",
     srcs = [
+        "dir_refs.go",
         "kcov.go",
         "sys.go",
     ],
@@ -13,6 +26,8 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/coverage",
+        "//pkg/log",
+        "//pkg/refs",
         "//pkg/sentry/arch",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 1f042d9f7..ea30a4ec2 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -118,6 +118,7 @@ func (fs *filesystem) Release(ctx context.Context) {
 
 // dir implements kernfs.Inode.
 type dir struct {
+	dirRefs
 	kernfs.InodeAttrs
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
@@ -133,6 +134,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	d := &dir{}
 	d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
 	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	d.EnableLeakCheck()
 	d.dentry.Init(d)
 
 	d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
@@ -140,7 +142,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	return &d.dentry
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
@@ -156,6 +158,11 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry,
 	return fd.VFSFileDescription(), nil
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (d *dir) DecRef(context.Context) {
+	d.dirRefs.DecRef(d.Destroy)
+}
+
 // cpuFile implements kernfs.Inode.
 type cpuFile struct {
 	kernfs.DynamicBytesFile
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index d1ecceba3..d436daab4 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -74,6 +74,50 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fd_table_refs",
+    out = "fd_table_refs.go",
+    package = "kernel",
+    prefix = "FDTable",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "FDTable",
+    },
+)
+
+go_template_instance(
+    name = "fs_context_refs",
+    out = "fs_context_refs.go",
+    package = "kernel",
+    prefix = "FSContext",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "FSContext",
+    },
+)
+
+go_template_instance(
+    name = "process_group_refs",
+    out = "process_group_refs.go",
+    package = "kernel",
+    prefix = "ProcessGroup",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "ProcessGroup",
+    },
+)
+
+go_template_instance(
+    name = "session_refs",
+    out = "session_refs.go",
+    package = "kernel",
+    prefix = "Session",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "Session",
+    },
+)
+
 proto_library(
     name = "uncaught_signal",
     srcs = ["uncaught_signal.proto"],
@@ -88,8 +132,10 @@ go_library(
         "aio.go",
         "context.go",
         "fd_table.go",
+        "fd_table_refs.go",
         "fd_table_unsafe.go",
         "fs_context.go",
+        "fs_context_refs.go",
         "ipc_namespace.go",
         "kcov.go",
         "kcov_unsafe.go",
@@ -101,6 +147,7 @@ go_library(
         "pending_signals_state.go",
         "posixtimer.go",
         "process_group_list.go",
+        "process_group_refs.go",
         "ptrace.go",
         "ptrace_amd64.go",
         "ptrace_arm64.go",
@@ -108,6 +155,7 @@ go_library(
         "seccomp.go",
         "seqatomic_taskgoroutineschedinfo_unsafe.go",
         "session_list.go",
+        "session_refs.go",
         "sessions.go",
         "signal.go",
         "signal_handlers.go",
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ce53af69b..5773244ac 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -23,7 +23,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -78,7 +77,8 @@ type descriptor struct {
 //
 // +stateify savable
 type FDTable struct {
-	refs.AtomicRefCount
+	FDTableRefs
+
 	k *Kernel
 
 	// mu protects below.
@@ -176,16 +176,15 @@ func (k *Kernel) NewFDTable() *FDTable {
 	return f
 }
 
-// destroy removes all of the file descriptors from the map.
-func (f *FDTable) destroy(ctx context.Context) {
-	f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
-		return true
-	})
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
+// DecRef implements RefCounter.DecRef.
+//
+// If f reaches zero references, all of its file descriptors are removed.
 func (f *FDTable) DecRef(ctx context.Context) {
-	f.DecRefWithDestructor(ctx, f.destroy)
+	f.FDTableRefs.DecRef(func() {
+		f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
+			return true
+		})
+	})
 }
 
 // Size returns the number of file descriptor slots currently allocated.
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 7fd97dc53..6b8feb107 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -31,6 +31,8 @@ type descriptorTable struct {
 }
 
 // init initializes the table.
+//
+// TODO(gvisor.dev/1486): Enable leak check for FDTable.
 func (f *FDTable) init() {
 	var slice []unsafe.Pointer // Empty slice.
 	atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 8f2d36d5a..d46d1e1c1 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -30,7 +29,7 @@ import (
 //
 // +stateify savable
 type FSContext struct {
-	refs.AtomicRefCount
+	FSContextRefs
 
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
@@ -64,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 		cwd:   cwd,
 		umask: umask,
 	}
-	f.EnableLeakCheck("kernel.FSContext")
+	f.EnableLeakCheck()
 	return &f
 }
 
@@ -77,54 +76,56 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
 		cwdVFS2:  cwd,
 		umask:    umask,
 	}
-	f.EnableLeakCheck("kernel.FSContext")
+	f.EnableLeakCheck()
 	return &f
 }
 
-// destroy is the destructor for an FSContext.
+// DecRef implements RefCounter.DecRef.
 //
-// This will call DecRef on both root and cwd Dirents.  If either call to
-// DecRef returns an error, then it will be propagated.  If both calls to
-// DecRef return an error, then the one from root.DecRef will be propagated.
+// When f reaches zero references, DecRef will be called on both root and cwd
+// Dirents.
 //
 // Note that there may still be calls to WorkingDirectory() or RootDirectory()
 // (that return nil).  This is because valid references may still be held via
 // proc files or other mechanisms.
-func (f *FSContext) destroy(ctx context.Context) {
-	// Hold f.mu so that we don't race with RootDirectory() and
-	// WorkingDirectory().
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
-	if VFS2Enabled {
-		f.rootVFS2.DecRef(ctx)
-		f.rootVFS2 = vfs.VirtualDentry{}
-		f.cwdVFS2.DecRef(ctx)
-		f.cwdVFS2 = vfs.VirtualDentry{}
-	} else {
-		f.root.DecRef(ctx)
-		f.root = nil
-		f.cwd.DecRef(ctx)
-		f.cwd = nil
-	}
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
 func (f *FSContext) DecRef(ctx context.Context) {
-	f.DecRefWithDestructor(ctx, f.destroy)
+	f.FSContextRefs.DecRef(func() {
+		// Hold f.mu so that we don't race with RootDirectory() and
+		// WorkingDirectory().
+		f.mu.Lock()
+		defer f.mu.Unlock()
+
+		if VFS2Enabled {
+			f.rootVFS2.DecRef(ctx)
+			f.rootVFS2 = vfs.VirtualDentry{}
+			f.cwdVFS2.DecRef(ctx)
+			f.cwdVFS2 = vfs.VirtualDentry{}
+		} else {
+			f.root.DecRef(ctx)
+			f.root = nil
+			f.cwd.DecRef(ctx)
+			f.cwd = nil
+		}
+	})
 }
 
 // Fork forks this FSContext.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) Fork() *FSContext {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
 	if VFS2Enabled {
+		if !f.cwdVFS2.Ok() {
+			panic("FSContext.Fork() called after destroy")
+		}
 		f.cwdVFS2.IncRef()
 		f.rootVFS2.IncRef()
 	} else {
+		if f.cwd == nil {
+			panic("FSContext.Fork() called after destroy")
+		}
 		f.cwd.IncRef()
 		f.root.IncRef()
 	}
@@ -140,8 +141,8 @@ func (f *FSContext) Fork() *FSContext {
 
 // WorkingDirectory returns the current working directory.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -152,8 +153,8 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent {
 
 // WorkingDirectoryVFS2 returns the current working directory.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
 // SetWorkingDirectory sets the current working directory.
 // This will take an extra reference on the Dirent.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
 	if d == nil {
 		panic("FSContext.SetWorkingDirectory called with nil dirent")
@@ -187,11 +188,15 @@ func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
 // SetWorkingDirectoryVFS2 sets the current working directory.
 // This will take an extra reference on the VirtualDentry.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
+	if !f.cwdVFS2.Ok() {
+		panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d))
+	}
+
 	old := f.cwdVFS2
 	f.cwdVFS2 = d
 	d.IncRef()
@@ -200,8 +205,8 @@ func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDe
 
 // RootDirectory returns the current filesystem root.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) RootDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -213,8 +218,8 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
 
 // RootDirectoryVFS2 returns the current filesystem root.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -226,7 +231,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
 // SetRootDirectory sets the root directory.
 // This will take an extra reference on the Dirent.
 //
-// This is not a valid call after free.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
 	if d == nil {
 		panic("FSContext.SetRootDirectory called with nil dirent")
@@ -247,7 +252,7 @@ func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
 
 // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
 //
-// This is not a valid call after free.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) {
 	if !vd.Ok() {
 		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 5c4c622c2..df5c8421b 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -16,8 +16,6 @@ package kernel
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -32,7 +30,7 @@ type ProcessGroupID ThreadID
 //
 // +stateify savable
 type Session struct {
-	refs refs.AtomicRefCount
+	SessionRefs
 
 	// leader is the originator of the Session.
 	//
@@ -62,16 +60,11 @@ type Session struct {
 	sessionEntry
 }
 
-// incRef grabs a reference.
-func (s *Session) incRef() {
-	s.refs.IncRef()
-}
-
-// decRef drops a reference.
+// DecRef drops a reference.
 //
 // Precondition: callers must hold TaskSet.mu for writing.
-func (s *Session) decRef() {
-	s.refs.DecRefWithDestructor(nil, func(context.Context) {
+func (s *Session) DecRef() {
+	s.SessionRefs.DecRef(func() {
 		// Remove translations from the leader.
 		for ns := s.leader.pidns; ns != nil; ns = ns.parent {
 			id := ns.sids[s]
@@ -88,7 +81,7 @@ func (s *Session) decRef() {
 //
 // +stateify savable
 type ProcessGroup struct {
-	refs refs.AtomicRefCount // not exported.
+	refs ProcessGroupRefs
 
 	// originator is the originator of the group.
 	//
@@ -163,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
 	}
 
 	alive := true
-	pg.refs.DecRefWithDestructor(nil, func(context.Context) {
+	pg.refs.DecRef(func() {
 		alive = false // don't bother with handleOrphan.
 
 		// Remove translations from the originator.
@@ -175,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
 
 		// Remove the list of process groups.
 		pg.session.processGroups.Remove(pg)
-		pg.session.decRef()
+		pg.session.DecRef()
 	})
 	if alive {
 		pg.handleOrphan()
@@ -302,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
 		id:     SessionID(id),
 		leader: tg,
 	}
-	s.refs.EnableLeakCheck("kernel.Session")
+	s.EnableLeakCheck()
 
 	// Create a new ProcessGroup, belonging to that Session.
 	// This also has a single reference (assigned below).
@@ -316,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
 		session:    s,
 		ancestors:  0,
 	}
-	pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+	pg.refs.EnableLeakCheck()
 
 	// Tie them and return the result.
 	s.processGroups.PushBack(pg)
@@ -396,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
 	//
 	// We manually adjust the ancestors if the parent is in the same
 	// session.
-	tg.processGroup.session.incRef()
+	tg.processGroup.session.IncRef()
 	pg := ProcessGroup{
 		id:         ProcessGroupID(id),
 		originator: tg,
 		session:    tg.processGroup.session,
 	}
-	pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+	pg.refs.EnableLeakCheck()
 
 	if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
 		pg.ancestors++
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index c211fc8d0..b7e4b480d 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,12 +1,25 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
+go_template_instance(
+    name = "shm_refs",
+    out = "shm_refs.go",
+    package = "shm",
+    prefix = "Shm",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "Shm",
+    },
+)
+
 go_library(
     name = "shm",
     srcs = [
         "device.go",
         "shm.go",
+        "shm_refs.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 13ec7afe0..00c03585e 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -39,7 +39,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -252,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
 		creatorPID:    pid,
 		changeTime:    ktime.NowFromContext(ctx),
 	}
-	shm.EnableLeakCheck("kernel.Shm")
+	shm.EnableLeakCheck()
 
 	// Find the next available ID.
 	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
@@ -337,14 +336,14 @@ func (r *Registry) remove(s *Shm) {
 //
 // +stateify savable
 type Shm struct {
-	// AtomicRefCount tracks the number of references to this segment.
+	// ShmRefs tracks the number of references to this segment.
 	//
 	// A segment holds a reference to itself until it is marked for
 	// destruction.
 	//
 	// In addition to direct users, the MemoryManager will hold references
 	// via MappingIdentity.
-	refs.AtomicRefCount
+	ShmRefs
 
 	mfp pgalloc.MemoryFileProvider
 
@@ -428,11 +427,14 @@ func (s *Shm) InodeID() uint64 {
 	return uint64(s.ID)
 }
 
-// DecRef overrides refs.RefCount.DecRef with a destructor.
+// DecRef drops a reference on s.
 //
 // Precondition: Caller must not hold s.mu.
 func (s *Shm) DecRef(ctx context.Context) {
-	s.DecRefWithDestructor(ctx, s.destroy)
+	s.ShmRefs.DecRef(func() {
+		s.mfp.MemoryFile().DecRef(s.fr)
+		s.registry.remove(s)
+	})
 }
 
 // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
@@ -642,11 +644,6 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
 	return nil
 }
 
-func (s *Shm) destroy(context.Context) {
-	s.mfp.MemoryFile().DecRef(s.fr)
-	s.registry.remove(s)
-}
-
 // MarkDestroyed marks a segment for destruction. The segment is actually
 // destroyed once it has no references. MarkDestroyed may be called multiple
 // times, and is safe to call after a segment has already been destroyed. See
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index f9d0837a1..b4a47ccca 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -73,12 +73,35 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "aio_mappable_refs",
+    out = "aio_mappable_refs.go",
+    package = "mm",
+    prefix = "aioMappable",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "aioMappable",
+    },
+)
+
+go_template_instance(
+    name = "special_mappable_refs",
+    out = "special_mappable_refs.go",
+    package = "mm",
+    prefix = "SpecialMappable",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "SpecialMappable",
+    },
+)
+
 go_library(
     name = "mm",
     srcs = [
         "address_space.go",
         "aio_context.go",
         "aio_context_state.go",
+        "aio_mappable_refs.go",
         "debug.go",
         "file_refcount_set.go",
         "io.go",
@@ -92,6 +115,7 @@ go_library(
         "save_restore.go",
         "shm.go",
         "special_mappable.go",
+        "special_mappable_refs.go",
         "syscalls.go",
         "vma.go",
         "vma_set.go",
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 16fea53c4..7bf48cb2c 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -17,7 +17,6 @@ package mm
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -239,7 +238,7 @@ func (ctx *AIOContext) Drain() {
 //
 // +stateify savable
 type aioMappable struct {
-	refs.AtomicRefCount
+	aioMappableRefs
 
 	mfp pgalloc.MemoryFileProvider
 	fr  memmap.FileRange
@@ -253,13 +252,13 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
 		return nil, err
 	}
 	m := aioMappable{mfp: mfp, fr: fr}
-	m.EnableLeakCheck("mm.aioMappable")
+	m.EnableLeakCheck()
 	return &m, nil
 }
 
 // DecRef implements refs.RefCounter.DecRef.
 func (m *aioMappable) DecRef(ctx context.Context) {
-	m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+	m.aioMappableRefs.DecRef(func() {
 		m.mfp.MemoryFile().DecRef(m.fr)
 	})
 }
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index 4cdb52eb6..f4c93baeb 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -16,7 +16,6 @@ package mm
 
 import (
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -31,7 +30,7 @@ import (
 //
 // +stateify savable
 type SpecialMappable struct {
-	refs.AtomicRefCount
+	SpecialMappableRefs
 
 	mfp  pgalloc.MemoryFileProvider
 	fr   memmap.FileRange
@@ -45,13 +44,13 @@ type SpecialMappable struct {
 // Preconditions: fr.Length() != 0.
 func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable {
 	m := SpecialMappable{mfp: mfp, fr: fr, name: name}
-	m.EnableLeakCheck("mm.SpecialMappable")
+	m.EnableLeakCheck()
 	return &m
 }
 
 // DecRef implements refs.RefCounter.DecRef.
 func (m *SpecialMappable) DecRef(ctx context.Context) {
-	m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+	m.SpecialMappableRefs.DecRef(func() {
 		m.mfp.MemoryFile().DecRef(m.fr)
 	})
 }
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index c708b6030..26c3a51b9 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "queue_refs",
+    out = "queue_refs.go",
+    package = "transport",
+    prefix = "queue",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "queue",
+    },
+)
+
 go_library(
     name = "transport",
     srcs = [
@@ -22,6 +33,7 @@ go_library(
         "connectioned_state.go",
         "connectionless.go",
         "queue.go",
+        "queue_refs.go",
         "transport_message_list.go",
         "unix.go",
     ],
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index c67b602f0..e3a75b519 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E
 	}
 
 	q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
-	q1.EnableLeakCheck("transport.queue")
+	q1.EnableLeakCheck()
 	q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit}
-	q2.EnableLeakCheck("transport.queue")
+	q2.EnableLeakCheck()
 
 	if stype == linux.SOCK_STREAM {
 		a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
@@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
 	}
 
 	readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit}
-	readQueue.EnableLeakCheck("transport.queue")
+	readQueue.EnableLeakCheck()
 	ne.connected = &connectedEndpoint{
 		endpoint:   ce,
 		writeQueue: readQueue,
 	}
 
 	writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit}
-	writeQueue.EnableLeakCheck("transport.queue")
+	writeQueue.EnableLeakCheck()
 	if e.stype == linux.SOCK_STREAM {
 		ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}}
 	} else {
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 70ee8f9b8..4751b2fd8 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -42,7 +42,7 @@ var (
 func NewConnectionless(ctx context.Context) Endpoint {
 	ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}}
 	q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}
-	q.EnableLeakCheck("transport.queue")
+	q.EnableLeakCheck()
 	ep.receiver = &queueReceiver{readQueue: &q}
 	return ep
 }
diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index ef6043e19..342def28f 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -16,7 +16,6 @@ package transport
 
 import (
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -28,7 +27,7 @@ import (
 //
 // +stateify savable
 type queue struct {
-	refs.AtomicRefCount
+	queueRefs
 
 	ReaderQueue *waiter.Queue
 	WriterQueue *waiter.Queue
@@ -68,11 +67,13 @@ func (q *queue) Reset(ctx context.Context) {
 	q.mu.Unlock()
 }
 
-// DecRef implements RefCounter.DecRef with destructor q.Reset.
+// DecRef implements RefCounter.DecRef.
 func (q *queue) DecRef(ctx context.Context) {
-	q.DecRefWithDestructor(ctx, q.Reset)
-	// We don't need to notify after resetting because no one cares about
-	// this queue after all references have been dropped.
+	q.queueRefs.DecRef(func() {
+		// We don't need to notify after resetting because no one cares about
+		// this queue after all references have been dropped.
+		q.Reset(ctx)
+	})
 }
 
 // IsReadable determines if q is currently readable.
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 642769e7c..8093ca55c 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -27,6 +27,39 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "file_description_refs",
+    out = "file_description_refs.go",
+    package = "vfs",
+    prefix = "FileDescription",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "FileDescription",
+    },
+)
+
+go_template_instance(
+    name = "mount_namespace_refs",
+    out = "mount_namespace_refs.go",
+    package = "vfs",
+    prefix = "MountNamespace",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "MountNamespace",
+    },
+)
+
+go_template_instance(
+    name = "filesystem_refs",
+    out = "filesystem_refs.go",
+    package = "vfs",
+    prefix = "Filesystem",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "Filesystem",
+    },
+)
+
 go_library(
     name = "vfs",
     srcs = [
@@ -40,12 +73,15 @@ go_library(
         "event_list.go",
         "file_description.go",
         "file_description_impl_util.go",
+        "file_description_refs.go",
         "filesystem.go",
         "filesystem_impl_util.go",
+        "filesystem_refs.go",
         "filesystem_type.go",
         "inotify.go",
         "lock.go",
         "mount.go",
+        "mount_namespace_refs.go",
         "mount_unsafe.go",
         "options.go",
         "pathname.go",
@@ -63,6 +99,7 @@ go_library(
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/log",
+        "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 4b9faf2ea..5aad31b78 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -184,12 +184,3 @@ This construction, which is essentially a type-safe analogue to Linux's
     -   File locking
 
     -   `O_ASYNC`
-
--   Reference counts in the `vfs` package do not use the `refs` package since
-    `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference
-    count, resulting in considerable cache bloat. 24 bytes of this overhead is
-    for weak reference support, which have poor performance and will not be used
-    by VFS2. The remaining 40 bytes is to store a descriptive string and stack
-    trace for reference leak checking; we can support reference leak checking
-    without incurring this space overhead by including the applicable
-    information directly in finalizers for applicable types.
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 3219a9e13..22a54fa48 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -38,9 +38,7 @@ import (
 //
 // FileDescription is analogous to Linux's struct file.
 type FileDescription struct {
-	// refs is the reference count. refs is accessed using atomic memory
-	// operations.
-	refs int64
+	FileDescriptionRefs
 
 	// flagsMu protects statusFlags and asyncHandler below.
 	flagsMu sync.Mutex
@@ -131,7 +129,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
 		}
 	}
 
-	fd.refs = 1
+	fd.EnableLeakCheck()
 
 	// Remove "file creation flags" to mirror the behavior from file.f_flags in
 	// fs/open.c:do_dentry_open.
@@ -149,30 +147,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
 	return nil
 }
 
-// IncRef increments fd's reference count.
-func (fd *FileDescription) IncRef() {
-	atomic.AddInt64(&fd.refs, 1)
-}
-
-// TryIncRef increments fd's reference count and returns true. If fd's
-// reference count is already zero, TryIncRef does nothing and returns false.
-//
-// TryIncRef does not require that a reference is held on fd.
-func (fd *FileDescription) TryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&fd.refs)
-		if refs <= 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
 // DecRef decrements fd's reference count.
 func (fd *FileDescription) DecRef(ctx context.Context) {
-	if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+	fd.FileDescriptionRefs.DecRef(func() {
 		// Unregister fd from all epoll instances.
 		fd.epollMu.Lock()
 		epolls := fd.epolls
@@ -208,15 +185,7 @@ func (fd *FileDescription) DecRef(ctx context.Context) {
 		}
 		fd.asyncHandler = nil
 		fd.flagsMu.Unlock()
-	} else if refs < 0 {
-		panic("FileDescription.DecRef() called without holding a reference")
-	}
-}
-
-// Refs returns the current number of references. The returned count
-// is inherently racy and is unsafe to use without external synchronization.
-func (fd *FileDescription) Refs() int64 {
-	return atomic.LoadInt64(&fd.refs)
+	})
 }
 
 // Mount returns the mount on which fd was opened. It does not take a reference
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 2c60cfab2..46851f638 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -15,8 +15,6 @@
 package vfs
 
 import (
-	"sync/atomic"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
@@ -34,9 +32,7 @@ import (
 //
 // +stateify savable
 type Filesystem struct {
-	// refs is the reference count. refs is accessed using atomic memory
-	// operations.
-	refs int64
+	FilesystemRefs
 
 	// vfs is the VirtualFilesystem that uses this Filesystem. vfs is
 	// immutable.
@@ -52,7 +48,7 @@ type Filesystem struct {
 
 // Init must be called before first use of fs.
 func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
-	fs.refs = 1
+	fs.EnableLeakCheck()
 	fs.vfs = vfsObj
 	fs.fsType = fsType
 	fs.impl = impl
@@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl {
 	return fs.impl
 }
 
-// IncRef increments fs' reference count.
-func (fs *Filesystem) IncRef() {
-	if atomic.AddInt64(&fs.refs, 1) <= 1 {
-		panic("Filesystem.IncRef() called without holding a reference")
-	}
-}
-
-// TryIncRef increments fs' reference count and returns true. If fs' reference
-// count is zero, TryIncRef does nothing and returns false.
-//
-// TryIncRef does not require that a reference is held on fs.
-func (fs *Filesystem) TryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&fs.refs)
-		if refs <= 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
 // DecRef decrements fs' reference count.
 func (fs *Filesystem) DecRef(ctx context.Context) {
-	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+	fs.FilesystemRefs.DecRef(func() {
 		fs.vfs.filesystemsMu.Lock()
 		delete(fs.vfs.filesystems, fs)
 		fs.vfs.filesystemsMu.Unlock()
 		fs.impl.Release(ctx)
-	} else if refs < 0 {
-		panic("Filesystem.decRef() called without holding a reference")
-	}
+	})
 }
 
 // FilesystemImpl contains implementation details for a Filesystem.
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index cd5456eef..db5fb3bb1 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -128,16 +128,14 @@ func (mnt *Mount) Options() MountOptions {
 //
 // +stateify savable
 type MountNamespace struct {
+	MountNamespaceRefs
+
 	// Owner is the usernamespace that owns this mount namespace.
 	Owner *auth.UserNamespace
 
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
 
-	// refs is the reference count. refs is accessed using atomic memory
-	// operations.
-	refs int64
-
 	// mountpoints maps all Dentries which are mount points in this namespace
 	// to the number of Mounts for which they are mount points. mountpoints is
 	// protected by VirtualFilesystem.mountMu.
@@ -168,9 +166,9 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	}
 	mntns := &MountNamespace{
 		Owner:       creds.UserNamespace,
-		refs:        1,
 		mountpoints: make(map[*Dentry]uint32),
 	}
+	mntns.EnableLeakCheck()
 	mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
 	return mntns, nil
 }
@@ -509,17 +507,10 @@ func (mnt *Mount) DecRef(ctx context.Context) {
 	}
 }
 
-// IncRef increments mntns' reference count.
-func (mntns *MountNamespace) IncRef() {
-	if atomic.AddInt64(&mntns.refs, 1) <= 1 {
-		panic("MountNamespace.IncRef() called without holding a reference")
-	}
-}
-
 // DecRef decrements mntns' reference count.
 func (mntns *MountNamespace) DecRef(ctx context.Context) {
 	vfs := mntns.root.fs.VirtualFilesystem()
-	if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
+	mntns.MountNamespaceRefs.DecRef(func() {
 		vfs.mountMu.Lock()
 		vfs.mounts.seq.BeginWrite()
 		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
@@ -533,9 +524,7 @@ func (mntns *MountNamespace) DecRef(ctx context.Context) {
 		for _, mnt := range mountsToDecRef {
 			mnt.DecRef(ctx)
 		}
-	} else if refs < 0 {
-		panic("MountNamespace.DecRef() called without holding a reference")
-	}
+	})
 }
 
 // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index 6c137f693..0243424f6 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -1,18 +1,32 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
+go_template_instance(
+    name = "tun_endpoint_refs",
+    out = "tun_endpoint_refs.go",
+    package = "tun",
+    prefix = "tunEndpoint",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "tunEndpoint",
+    },
+)
+
 go_library(
     name = "tun",
     srcs = [
         "device.go",
         "protocol.go",
+        "tun_endpoint_refs.go",
         "tun_unsafe.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
         "//pkg/refs",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index 3b1510a33..b6ddbe81e 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -19,7 +19,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -135,6 +134,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE
 
 		// 2. Creating a new NIC.
 		id := tcpip.NICID(s.UniqueID())
+		// TODO(gvisor.dev/1486): enable leak check for tunEndpoint.
 		endpoint := &tunEndpoint{
 			Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
 			stack:    s,
@@ -331,19 +331,18 @@ func (d *Device) WriteNotify() {
 // It is ref-counted as multiple opening files can attach to the same NIC.
 // The last owner is responsible for deleting the NIC.
 type tunEndpoint struct {
+	tunEndpointRefs
 	*channel.Endpoint
 
-	refs.AtomicRefCount
-
 	stack *stack.Stack
 	nicID tcpip.NICID
 	name  string
 	isTap bool
 }
 
-// DecRef decrements refcount of e, removes NIC if refcount goes to 0.
+// DecRef decrements refcount of e, removing NIC if it reaches 0.
 func (e *tunEndpoint) DecRef(ctx context.Context) {
-	e.DecRefWithDestructor(ctx, func(context.Context) {
+	e.tunEndpointRefs.DecRef(func() {
 		e.stack.RemoveNIC(e.nicID)
 	})
 }
-- 
cgit v1.2.3


From 2a322c451e0a04df55d8fa4ea6e055da39231efa Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 26 Aug 2020 10:03:46 -0700
Subject: tmpfs: Allow xattrs in the trusted namespace if creds has
 CAP_SYS_ADMIN.

This is needed to support the overlay opaque attribute.

PiperOrigin-RevId: 328552985
---
 pkg/abi/linux/xattr.go                  |  3 ++
 pkg/sentry/fsimpl/overlay/filesystem.go |  2 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs.go        | 68 ++++++++++++++++++------------
 test/syscalls/linux/memfd.cc            | 17 +-------
 test/syscalls/linux/xattr.cc            | 74 +++++++++++++++++++++++++++++++++
 test/util/fs_util.cc                    | 20 +++++++++
 test/util/fs_util.h                     |  6 +++
 7 files changed, 147 insertions(+), 43 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
index 99180b208..8ef837f27 100644
--- a/pkg/abi/linux/xattr.go
+++ b/pkg/abi/linux/xattr.go
@@ -23,6 +23,9 @@ const (
 	XATTR_CREATE  = 1
 	XATTR_REPLACE = 2
 
+	XATTR_TRUSTED_PREFIX     = "trusted."
+	XATTR_TRUSTED_PREFIX_LEN = len(XATTR_TRUSTED_PREFIX)
+
 	XATTR_USER_PREFIX     = "user."
 	XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX)
 )
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index a3cee4047..e720bfb0b 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -30,7 +30,7 @@ import (
 // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
 // opaque directories.
 // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
-const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+const _OVL_XATTR_OPAQUE = linux.XATTR_TRUSTED_PREFIX + "overlay.opaque"
 
 func isWhiteout(stat *linux.Statx) bool {
 	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index a7fdf19ca..c4cec4130 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -631,49 +631,65 @@ func (i *inode) listxattr(size uint64) ([]string, error) {
 }
 
 func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
-	if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
-	}
-	if !i.userXattrSupported() {
-		return "", syserror.ENODATA
-	}
 	return i.xattrs.Getxattr(opts)
 }
 
 func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
-	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !i.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return i.xattrs.Setxattr(opts)
 }
 
 func (i *inode) removexattr(creds *auth.Credentials, name string) error {
-	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !i.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return i.xattrs.Removexattr(name)
 }
 
-// Extended attributes in the user.* namespace are only supported for regular
-// files and directories.
-func (i *inode) userXattrSupported() bool {
-	filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
-	return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	switch {
+	case ats&vfs.MayRead == vfs.MayRead:
+		if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+			return err
+		}
+	case ats&vfs.MayWrite == vfs.MayWrite:
+		if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+			return err
+		}
+	default:
+		panic(fmt.Sprintf("checkXattrPermissions called with impossible AccessTypes: %v", ats))
+	}
+
+	switch {
+	case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
+		// The trusted.* namespace can only be accessed by privileged
+		// users.
+		if creds.HasCapability(linux.CAP_SYS_ADMIN) {
+			return nil
+		}
+		if ats&vfs.MayWrite == vfs.MayWrite {
+			return syserror.EPERM
+		}
+		return syserror.ENODATA
+	case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
+		// Extended attributes in the user.* namespace are only
+		// supported for regular files and directories.
+		filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
+		if filetype == linux.S_IFREG || filetype == linux.S_IFDIR {
+			return nil
+		}
+		if ats&vfs.MayWrite == vfs.MayWrite {
+			return syserror.EPERM
+		}
+		return syserror.ENODATA
+
+	}
+	return syserror.EOPNOTSUPP
 }
 
 // fileDescription is embedded by tmpfs implementations of
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
index f8b7f7938..4a450742b 100644
--- a/test/syscalls/linux/memfd.cc
+++ b/test/syscalls/linux/memfd.cc
@@ -14,12 +14,10 @@
 
 #include <errno.h>
 #include <fcntl.h>
-#include <linux/magic.h>
 #include <linux/memfd.h>
 #include <linux/unistd.h>
 #include <string.h>
 #include <sys/mman.h>
-#include <sys/statfs.h>
 #include <sys/syscall.h>
 
 #include <vector>
@@ -53,6 +51,7 @@ namespace {
 #define F_SEAL_GROW 0x0004
 #define F_SEAL_WRITE 0x0008
 
+using ::gvisor::testing::IsTmpfs;
 using ::testing::StartsWith;
 
 const std::string kMemfdName = "some-memfd";
@@ -444,20 +443,6 @@ TEST(MemfdTest, SealsAreInodeLevelProperties) {
   EXPECT_THAT(ftruncate(memfd3.get(), kPageSize), SyscallFailsWithErrno(EPERM));
 }
 
-PosixErrorOr<bool> IsTmpfs(const std::string& path) {
-  struct statfs stat;
-  if (statfs(path.c_str(), &stat)) {
-    if (errno == ENOENT) {
-      // Nothing at path, don't raise this as an error. Instead, just report no
-      // tmpfs at path.
-      return false;
-    }
-    return PosixError(errno,
-                      absl::StrFormat("statfs(\"%s\", %#p)", path, &stat));
-  }
-  return stat.f_type == TMPFS_MAGIC;
-}
-
 // Tmpfs files also support seals, but are created with F_SEAL_SEAL.
 TEST(MemfdTest, TmpfsFilesHaveSealSeal) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs("/tmp")));
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index cbcf08451..5510a87a0 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -28,6 +28,7 @@
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -37,6 +38,8 @@ namespace testing {
 
 namespace {
 
+using ::gvisor::testing::IsTmpfs;
+
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNonexistentFile) {
@@ -604,6 +607,77 @@ TEST_F(XattrTest, XattrWithFD) {
   EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
 }
 
+TEST_F(XattrTest, TrustedNamespaceWithCapSysAdmin) {
+  // Trusted namespace not supported in VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  // TODO(b/66162845): Only gVisor tmpfs currently supports trusted namespace.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(test_file_name_)));
+
+  // Setting/Getting in the trusted namespace requires CAP_SYS_ADMIN.
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  const char* path = test_file_name_.c_str();
+  const char name[] = "trusted.test";
+
+  // Set.
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  // Get.
+  char got = '\0';
+  EXPECT_THAT(getxattr(path, name, &got, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(val, got);
+
+  // List.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  // Remove.
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
+
+  // Get should now return ENODATA.
+  EXPECT_THAT(getxattr(path, name, &got, size), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, TrustedNamespaceWithoutCapSysAdmin) {
+  // Trusted namespace not supported in VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  // TODO(b/66162845): Only gVisor tmpfs currently supports trusted namespace.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(test_file_name_)));
+
+  // Drop CAP_SYS_ADMIN if we have it.
+  if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))) {
+    EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false));
+  }
+
+  const char* path = test_file_name_.c_str();
+  const char name[] = "trusted.test";
+
+  // Set fails.
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+
+  // Get fails.
+  char got = '\0';
+  EXPECT_THAT(getxattr(path, name, &got, size), SyscallFailsWithErrno(ENODATA));
+
+  // List still works, but returns no items.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)), SyscallSucceedsWithValue(0));
+
+  // Remove fails.
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 5418948fe..dffa16183 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -15,7 +15,11 @@
 #include "test/util/fs_util.h"
 
 #include <dirent.h>
+#ifndef __fuchsia__
+#include <linux/magic.h>
+#endif  // __fuchsia__
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -629,5 +633,21 @@ PosixErrorOr<std::string> ProcessExePath(int pid) {
   return ReadLink(absl::StrCat("/proc/", pid, "/exe"));
 }
 
+#ifndef __fuchsia__
+PosixErrorOr<bool> IsTmpfs(const std::string& path) {
+  struct statfs stat;
+  if (statfs(path.c_str(), &stat)) {
+    if (errno == ENOENT) {
+      // Nothing at path, don't raise this as an error. Instead, just report no
+      // tmpfs at path.
+      return false;
+    }
+    return PosixError(errno,
+                      absl::StrFormat("statfs(\"%s\", %#p)", path, &stat));
+  }
+  return stat.f_type == TMPFS_MAGIC;
+}
+#endif  // __fuchsia__
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index 8cdac23a1..044190657 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -17,6 +17,7 @@
 
 #include <dirent.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -178,6 +179,11 @@ std::string CleanPath(absl::string_view path);
 // Returns the full path to the executable of the given pid or a PosixError.
 PosixErrorOr<std::string> ProcessExePath(int pid);
 
+#ifndef __fuchsia__
+// IsTmpfs returns true if the file at path is backed by tmpfs.
+PosixErrorOr<bool> IsTmpfs(const std::string& path);
+#endif  // __fucshia__
+
 namespace internal {
 // Not part of the public API.
 std::string JoinPathImpl(std::initializer_list<absl::string_view> paths);
-- 
cgit v1.2.3


From d872b342b2c2291420a9570edcf340040754bb44 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 26 Aug 2020 12:28:58 -0700
Subject: Remove spurious fd.IncRef().

PiperOrigin-RevId: 328583461
---
 pkg/sentry/fsimpl/overlay/non_directory.go | 1 -
 1 file changed, 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
index d3060a481..268b32537 100644
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -121,7 +121,6 @@ func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
 		fd.cachedFlags = statusFlags
 	}
 	wrappedFD := fd.cachedFD
-	defer wrappedFD.IncRef()
 	fd.mu.Unlock()
 	return wrappedFD.OnClose(ctx)
 }
-- 
cgit v1.2.3


From f63cddc6b4826007ca2a755d30b2df65ea21c518 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 26 Aug 2020 14:40:30 -0700
Subject: Support stdlib analyzers with nogo.

This immediately revealed an escape analysis violation (!), where
the sync.Map was being used in a context that escapes were not
allowed. This is a relatively minor fix and is included.

PiperOrigin-RevId: 328611237
---
 pkg/sentry/platform/kvm/bluepill_fault.go          |   4 +
 pkg/sentry/platform/kvm/kvm_const.go               |   2 +
 pkg/sentry/platform/kvm/machine.go                 |  40 ++-
 tools/bazeldefs/defs.bzl                           |  11 +-
 tools/checkescape/BUILD                            |   2 +-
 tools/checkescape/checkescape.go                   |  19 +-
 tools/checkescape/test1/test1.go                   |  15 -
 tools/checkescape/test2/test2.go                   |   6 -
 tools/go_marshal/gomarshal/generator_interfaces.go |   2 +-
 tools/nogo/BUILD                                   |  13 +-
 tools/nogo/build.go                                |   4 +-
 tools/nogo/config.go                               |   8 +
 tools/nogo/data/BUILD                              |  10 -
 tools/nogo/data/data.go                            |  21 --
 tools/nogo/defs.bzl                                | 185 ++++++++----
 tools/nogo/dump/BUILD                              |  10 +
 tools/nogo/dump/dump.go                            |  78 +++++
 tools/nogo/nogo.go                                 | 323 +++++++++++++++++----
 18 files changed, 568 insertions(+), 185 deletions(-)
 delete mode 100644 tools/nogo/data/BUILD
 delete mode 100644 tools/nogo/data/data.go
 create mode 100644 tools/nogo/dump/BUILD
 create mode 100644 tools/nogo/dump/dump.go

(limited to 'pkg')

diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
index e34f46aeb..a182e4f22 100644
--- a/pkg/sentry/platform/kvm/bluepill_fault.go
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -98,6 +98,10 @@ func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegi
 	}
 	errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags)
 	if errno == 0 {
+		// Store the physical address in the slot. This is used to
+		// avoid calls to handleBluepillFault in the future (see
+		// machine.mapPhysical).
+		atomic.StoreUintptr(&m.usedSlots[slot], physical)
 		// Successfully added region; we can increment nextSlot and
 		// allow another set to proceed here.
 		atomic.StoreUint32(&m.nextSlot, slot+1)
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 3bf918446..5c4b18899 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -56,6 +56,7 @@ const (
 
 // KVM capability options.
 const (
+	_KVM_CAP_MAX_MEMSLOTS          = 0x0a
 	_KVM_CAP_MAX_VCPUS             = 0x42
 	_KVM_CAP_ARM_VM_IPA_SIZE       = 0xa5
 	_KVM_CAP_VCPU_EVENTS           = 0x29
@@ -64,6 +65,7 @@ const (
 
 // KVM limits.
 const (
+	_KVM_NR_MEMSLOTS      = 0x100
 	_KVM_NR_VCPUS         = 0xff
 	_KVM_NR_INTERRUPTS    = 0x100
 	_KVM_NR_CPUID_ENTRIES = 0x100
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 6c54712d1..372a4cbd7 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -43,9 +43,6 @@ type machine struct {
 	// kernel is the set of global structures.
 	kernel ring0.Kernel
 
-	// mappingCache is used for mapPhysical.
-	mappingCache sync.Map
-
 	// mu protects vCPUs.
 	mu sync.RWMutex
 
@@ -63,6 +60,12 @@ type machine struct {
 	// maxVCPUs is the maximum number of vCPUs supported by the machine.
 	maxVCPUs int
 
+	// maxSlots is the maximum number of memory slots supported by the machine.
+	maxSlots int
+
+	// usedSlots is the set of used physical addresses (sorted).
+	usedSlots []uintptr
+
 	// nextID is the next vCPU ID.
 	nextID uint32
 }
@@ -184,6 +187,7 @@ func newMachine(vm int) (*machine, error) {
 		PageTables: pagetables.New(newAllocator()),
 	})
 
+	// Pull the maximum vCPUs.
 	maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
 	if errno != 0 {
 		m.maxVCPUs = _KVM_NR_VCPUS
@@ -191,11 +195,19 @@ func newMachine(vm int) (*machine, error) {
 		m.maxVCPUs = int(maxVCPUs)
 	}
 	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
-
-	// Create the vCPUs map/slices.
 	m.vCPUsByTID = make(map[uint64]*vCPU)
 	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
 
+	// Pull the maximum slots.
+	maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
+	if errno != 0 {
+		m.maxSlots = _KVM_NR_MEMSLOTS
+	} else {
+		m.maxSlots = int(maxSlots)
+	}
+	log.Debugf("The maximum number of slots is %d.", m.maxSlots)
+	m.usedSlots = make([]uintptr, m.maxSlots)
+
 	// Apply the physical mappings. Note that these mappings may point to
 	// guest physical addresses that are not actually available. These
 	// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -272,6 +284,20 @@ func newMachine(vm int) (*machine, error) {
 	return m, nil
 }
 
+// hasSlot returns true iff the given address is mapped.
+//
+// This must be done via a linear scan.
+//
+//go:nosplit
+func (m *machine) hasSlot(physical uintptr) bool {
+	for i := 0; i < len(m.usedSlots); i++ {
+		if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
+			return true
+		}
+	}
+	return false
+}
+
 // mapPhysical checks for the mapping of a physical range, and installs one if
 // not available. This attempts to be efficient for calls in the hot path.
 //
@@ -286,8 +312,8 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg
 			panic("mapPhysical on unknown physical address")
 		}
 
-		if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
-			// Not present in the cache; requires setting the slot.
+		// Is this already mapped? Check the usedSlots.
+		if !m.hasSlot(physicalStart) {
 			if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok {
 				panic("handleBluepillFault failed")
 			}
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 4bbcda054..dad5fc3b2 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -147,7 +147,7 @@ def go_rule(rule, implementation, **kwargs):
     Returns:
         The result of invoking the rule.
     """
-    attrs = kwargs.pop("attrs", [])
+    attrs = kwargs.pop("attrs", dict())
     attrs["_go_context_data"] = attr.label(default = "@io_bazel_rules_go//:go_context_data")
     attrs["_stdlib"] = attr.label(default = "@io_bazel_rules_go//:stdlib")
     toolchains = kwargs.get("toolchains", []) + ["@io_bazel_rules_go//go:toolchain"]
@@ -158,12 +158,17 @@ def go_test_library(target):
         return target.attr.embed[0]
     return None
 
-def go_context(ctx):
+def go_context(ctx, std = False):
+    # We don't change anything for the standard library analysis. All Go files
+    # are available in all instances. Note that this includes the standard
+    # library sources, which are analyzed by nogo.
     go_ctx = _go_context(ctx)
     return struct(
         go = go_ctx.go,
         env = go_ctx.env,
-        runfiles = depset([go_ctx.go] + go_ctx.sdk.tools + go_ctx.stdlib.libs),
+        nogo_args = [],
+        stdlib_srcs = go_ctx.sdk.srcs,
+        runfiles = depset([go_ctx.go] + go_ctx.sdk.srcs + go_ctx.sdk.tools + go_ctx.stdlib.libs),
         goos = go_ctx.sdk.goos,
         goarch = go_ctx.sdk.goarch,
         tags = go_ctx.tags,
diff --git a/tools/checkescape/BUILD b/tools/checkescape/BUILD
index b8c3ddf44..6273aa779 100644
--- a/tools/checkescape/BUILD
+++ b/tools/checkescape/BUILD
@@ -8,7 +8,7 @@ go_library(
     nogo = False,
     visibility = ["//tools/nogo:__subpackages__"],
     deps = [
-        "//tools/nogo/data",
+        "//tools/nogo/dump",
         "@org_golang_x_tools//go/analysis:go_tool_library",
         "@org_golang_x_tools//go/analysis/passes/buildssa:go_tool_library",
         "@org_golang_x_tools//go/ssa:go_tool_library",
diff --git a/tools/checkescape/checkescape.go b/tools/checkescape/checkescape.go
index f8def4823..aab3c36a1 100644
--- a/tools/checkescape/checkescape.go
+++ b/tools/checkescape/checkescape.go
@@ -66,7 +66,6 @@ import (
 	"go/token"
 	"go/types"
 	"io"
-	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -74,7 +73,7 @@ import (
 	"golang.org/x/tools/go/analysis"
 	"golang.org/x/tools/go/analysis/passes/buildssa"
 	"golang.org/x/tools/go/ssa"
-	"gvisor.dev/gvisor/tools/nogo/data"
+	"gvisor.dev/gvisor/tools/nogo/dump"
 )
 
 const (
@@ -256,15 +255,14 @@ func (ec *EscapeCount) Record(reason EscapeReason) bool {
 // used only to remove false positives for escape analysis. The call will be
 // elided if escape analysis is able to put the object on the heap exclusively.
 func loadObjdump() (map[LinePosition]string, error) {
-	f, err := os.Open(data.Objdump)
+	cmd, out, err := dump.Command()
 	if err != nil {
 		return nil, err
 	}
-	defer f.Close()
 
 	// Build the map.
 	m := make(map[LinePosition]string)
-	r := bufio.NewReader(f)
+	r := bufio.NewReader(out)
 	var (
 		lastField string
 		lastPos   LinePosition
@@ -329,6 +327,11 @@ func loadObjdump() (map[LinePosition]string, error) {
 		}
 	}
 
+	// Wait for the dump to finish.
+	if err := cmd.Wait(); err != nil {
+		return nil, err
+	}
+
 	return m, nil
 }
 
@@ -413,12 +416,6 @@ func run(pass *analysis.Pass) (interface{}, error) {
 					return escapes(unknownPackage, "no package", inst, ec)
 				}
 
-				// Atomic functions are instrinics. We can
-				// assume that they don't escape.
-				if x.Pkg.Pkg.Name() == "atomic" {
-					return nil
-				}
-
 				// Is this a local function? If yes, call the
 				// function to load the local function. The
 				// local escapes are the escapes found in the
diff --git a/tools/checkescape/test1/test1.go b/tools/checkescape/test1/test1.go
index 68d3f72cc..a1d36459f 100644
--- a/tools/checkescape/test1/test1.go
+++ b/tools/checkescape/test1/test1.go
@@ -17,7 +17,6 @@ package test1
 
 import (
 	"fmt"
-	"reflect"
 )
 
 // Interface is a generic interface.
@@ -163,20 +162,6 @@ func dynamicRec(f func()) {
 	Dynamic(f)
 }
 
-// +mustescape:local,unknown
-//go:noinline
-//go:nosplit
-func Unknown() {
-	_ = reflect.TypeOf((*Type)(nil)) // Does not actually escape.
-}
-
-// +mustescape:unknown
-//go:noinline
-//go:nosplit
-func unknownRec() {
-	Unknown()
-}
-
 //go:noinline
 //go:nosplit
 func internalFunc() {
diff --git a/tools/checkescape/test2/test2.go b/tools/checkescape/test2/test2.go
index 7fce3e3be..2d5865f47 100644
--- a/tools/checkescape/test2/test2.go
+++ b/tools/checkescape/test2/test2.go
@@ -81,12 +81,6 @@ func dynamicCrossPkg(f func()) {
 	test1.Dynamic(f)
 }
 
-// +mustescape:unknown
-//go:noinline
-func unknownCrossPkg() {
-	test1.Unknown()
-}
-
 // +mustescape:stack
 //go:noinline
 func splitCrosssPkt() {
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index e3c3dac63..cf76b5241 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -224,7 +224,7 @@ func (g *interfaceGenerator) emitNoEscapeSliceDataPointer(srcPtr, dstVar string)
 func (g *interfaceGenerator) emitKeepAlive(ptrVar string) {
 	g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", ptrVar)
 	g.emit("// must live until the use above.\n")
-	g.emit("runtime.KeepAlive(%s)\n", ptrVar)
+	g.emit("runtime.KeepAlive(%s) // escapes: replaced by intrinsic.\n", ptrVar)
 }
 
 func (g *interfaceGenerator) expandBinaryExpr(b *strings.Builder, e *ast.BinaryExpr) {
diff --git a/tools/nogo/BUILD b/tools/nogo/BUILD
index e1bfb9a2c..fb35c5ffd 100644
--- a/tools/nogo/BUILD
+++ b/tools/nogo/BUILD
@@ -1,7 +1,18 @@
 load("//tools:defs.bzl", "bzl_library", "go_library")
+load("//tools/nogo:defs.bzl", "nogo_dump_tool", "nogo_stdlib")
 
 package(licenses = ["notice"])
 
+nogo_dump_tool(
+    name = "dump_tool",
+    visibility = ["//visibility:public"],
+)
+
+nogo_stdlib(
+    name = "stdlib",
+    visibility = ["//visibility:public"],
+)
+
 go_library(
     name = "nogo",
     srcs = [
@@ -16,7 +27,7 @@ go_library(
     deps = [
         "//tools/checkescape",
         "//tools/checkunsafe",
-        "//tools/nogo/data",
+        "//tools/nogo/dump",
         "@org_golang_x_tools//go/analysis:go_tool_library",
         "@org_golang_x_tools//go/analysis/internal/facts:go_tool_library",
         "@org_golang_x_tools//go/analysis/passes/asmdecl:go_tool_library",
diff --git a/tools/nogo/build.go b/tools/nogo/build.go
index 433d13738..37947b5c3 100644
--- a/tools/nogo/build.go
+++ b/tools/nogo/build.go
@@ -31,10 +31,10 @@ var (
 )
 
 // findStdPkg needs to find the bundled standard library packages.
-func (i *importer) findStdPkg(path string) (io.ReadCloser, error) {
+func findStdPkg(GOOS, GOARCH, path string) (io.ReadCloser, error) {
 	if path == "C" {
 		// Cgo builds cannot be analyzed. Skip.
 		return nil, ErrSkip
 	}
-	return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", i.GOOS, i.GOARCH, path))
+	return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", GOOS, GOARCH, path))
 }
diff --git a/tools/nogo/config.go b/tools/nogo/config.go
index 6958fca69..451cd4a4c 100644
--- a/tools/nogo/config.go
+++ b/tools/nogo/config.go
@@ -84,6 +84,14 @@ var analyzerConfig = map[*analysis.Analyzer]matcher{
 		externalExcluded(
 			".*protobuf/.*.go",              // Bad conversions.
 			".*flate/huffman_bit_writer.go", // Bad conversion.
+
+			// Runtime internal violations.
+			".*reflect/value.go",
+			".*encoding/xml/xml.go",
+			".*runtime/pprof/internal/profile/proto.go",
+			".*fmt/scan.go",
+			".*go/types/conversions.go",
+			".*golang.org/x/net/dns/dnsmessage/message.go",
 		),
 	),
 	shadow.Analyzer:      disableMatches(),  // Disabled for now.
diff --git a/tools/nogo/data/BUILD b/tools/nogo/data/BUILD
deleted file mode 100644
index b7564cc44..000000000
--- a/tools/nogo/data/BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "data",
-    srcs = ["data.go"],
-    nogo = False,
-    visibility = ["//tools:__subpackages__"],
-)
diff --git a/tools/nogo/data/data.go b/tools/nogo/data/data.go
deleted file mode 100644
index eb84d0d27..000000000
--- a/tools/nogo/data/data.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package data contains shared data for nogo analysis.
-//
-// This is used to break a dependency cycle.
-package data
-
-// Objdump is the dumped binary under analysis.
-var Objdump string
diff --git a/tools/nogo/defs.bzl b/tools/nogo/defs.bzl
index 5377620b0..963084d53 100644
--- a/tools/nogo/defs.bzl
+++ b/tools/nogo/defs.bzl
@@ -2,6 +2,103 @@
 
 load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule", "go_test_library")
 
+def _nogo_dump_tool_impl(ctx):
+    # Extract the Go context.
+    go_ctx = go_context(ctx)
+
+    # Construct the magic dump command.
+    #
+    # Note that in some cases, the input is being fed into the tool via stdin.
+    # Unfortunately, the Go objdump tool expects to see a seekable file [1], so
+    # we need the tool to handle this case by creating a temporary file.
+    #
+    # [1] https://github.com/golang/go/issues/41051
+    env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_ctx.env.items()])
+    dumper = ctx.actions.declare_file(ctx.label.name)
+    ctx.actions.write(dumper, "\n".join([
+        "#!/bin/bash",
+        "set -euo pipefail",
+        "if [[ $# -eq 0 ]]; then",
+        " T=$(mktemp -u -t libXXXXXX.a)",
+        " cat /dev/stdin > ${T}",
+        "else",
+        " T=$1;",
+        "fi",
+        "%s %s tool objdump ${T}" % (
+            env_prefix,
+            go_ctx.go.path,
+        ),
+        "if [[ $# -eq 0 ]]; then",
+        " rm -rf ${T}",
+        "fi",
+        "",
+    ]), is_executable = True)
+
+    # Include the full runfiles.
+    return [DefaultInfo(
+        runfiles = ctx.runfiles(files = go_ctx.runfiles.to_list()),
+        executable = dumper,
+    )]
+
+nogo_dump_tool = go_rule(
+    rule,
+    implementation = _nogo_dump_tool_impl,
+)
+
+# NogoStdlibInfo is the set of standard library facts.
+NogoStdlibInfo = provider(
+    "information for nogo analysis (standard library facts)",
+    fields = {
+        "facts": "serialized standard library facts",
+    },
+)
+
+def _nogo_stdlib_impl(ctx):
+    # Extract the Go context.
+    go_ctx = go_context(ctx)
+
+    # Build the standard library facts.
+    facts = ctx.actions.declare_file(ctx.label.name + ".facts")
+    config = struct(
+        Srcs = [f.path for f in go_ctx.stdlib_srcs],
+        GOOS = go_ctx.goos,
+        GOARCH = go_ctx.goarch,
+        Tags = go_ctx.tags,
+        FactOutput = facts.path,
+    )
+    config_file = ctx.actions.declare_file(ctx.label.name + ".cfg")
+    ctx.actions.write(config_file, config.to_json())
+    ctx.actions.run(
+        inputs = [config_file] + go_ctx.stdlib_srcs,
+        outputs = [facts],
+        tools = depset(go_ctx.runfiles.to_list() + ctx.files._dump_tool),
+        executable = ctx.files._nogo[0],
+        mnemonic = "GoStandardLibraryAnalysis",
+        progress_message = "Analyzing Go Standard Library",
+        arguments = go_ctx.nogo_args + [
+            "-dump_tool=%s" % ctx.files._dump_tool[0].path,
+            "-stdlib=%s" % config_file.path,
+        ],
+    )
+
+    # Return the stdlib facts as output.
+    return [NogoStdlibInfo(
+        facts = facts,
+    )]
+
+nogo_stdlib = go_rule(
+    rule,
+    implementation = _nogo_stdlib_impl,
+    attrs = {
+        "_nogo": attr.label(
+            default = "//tools/nogo/check:check",
+        ),
+        "_dump_tool": attr.label(
+            default = "//tools/nogo:dump_tool",
+        ),
+    },
+)
+
 # NogoInfo is the serialized set of package facts for a nogo analysis.
 #
 # Each go_library rule will generate a corresponding nogo rule, which will run
@@ -33,6 +130,9 @@ def _nogo_aspect_impl(target, ctx):
     else:
         return [NogoInfo()]
 
+    # Extract the Go context.
+    go_ctx = go_context(ctx)
+
     # If we're using the "library" attribute, then we need to aggregate the
     # original library sources and dependencies into this target to perform
     # proper type analysis.
@@ -45,10 +145,6 @@ def _nogo_aspect_impl(target, ctx):
             if hasattr(info, "deps"):
                 deps = deps + info.deps
 
-    # Construct the Go environment from the go_ctx.env dictionary.
-    go_ctx = go_context(ctx)
-    env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_ctx.env.items()])
-
     # Start with all target files and srcs as input.
     inputs = target.files.to_list() + srcs
 
@@ -64,26 +160,7 @@ def _nogo_aspect_impl(target, ctx):
     else:
         # Use the raw binary for go_binary and go_test targets.
         target_objfile = binaries[0]
-    disasm_file = ctx.actions.declare_file(target.label.name + ".out")
-    dumper = ctx.actions.declare_file("%s-dumper" % ctx.label.name)
-    ctx.actions.write(dumper, "\n".join([
-        "#!/bin/bash",
-        "%s %s tool objdump %s > %s\n" % (
-            env_prefix,
-            go_ctx.go.path,
-            target_objfile.path,
-            disasm_file.path,
-        ),
-    ]), is_executable = True)
-    ctx.actions.run(
-        inputs = [target_objfile],
-        outputs = [disasm_file],
-        tools = go_ctx.runfiles,
-        mnemonic = "GoObjdump",
-        progress_message = "Objdump %s" % target.label,
-        executable = dumper,
-    )
-    inputs.append(disasm_file)
+    inputs.append(target_objfile)
 
     # Extract the importpath for this package.
     if ctx.rule.kind == "go_test":
@@ -96,25 +173,9 @@ def _nogo_aspect_impl(target, ctx):
     else:
         importpath = go_importpath(target)
 
-    # The nogo tool requires a configfile serialized in JSON format to do its
-    # work. This must line up with the nogo.Config fields.
-    facts = ctx.actions.declare_file(target.label.name + ".facts")
-    config = struct(
-        ImportPath = importpath,
-        GoFiles = [src.path for src in srcs if src.path.endswith(".go")],
-        NonGoFiles = [src.path for src in srcs if not src.path.endswith(".go")],
-        # Google's internal build system needs a bit more help to find std.
-        StdZip = go_ctx.std_zip.short_path if hasattr(go_ctx, "std_zip") else "",
-        GOOS = go_ctx.goos,
-        GOARCH = go_ctx.goarch,
-        Tags = go_ctx.tags,
-        FactMap = {},  # Constructed below.
-        ImportMap = {},  # Constructed below.
-        FactOutput = facts.path,
-        Objdump = disasm_file.path,
-    )
-
     # Collect all info from shadow dependencies.
+    fact_map = dict()
+    import_map = dict()
     for dep in deps:
         # There will be no file attribute set for all transitive dependencies
         # that are not go_library or go_binary rules, such as a proto rules.
@@ -129,27 +190,46 @@ def _nogo_aspect_impl(target, ctx):
         x_files = [f.path for f in info.binaries if f.path.endswith(".x")]
         if not len(x_files):
             x_files = [f.path for f in info.binaries if f.path.endswith(".a")]
-        config.ImportMap[info.importpath] = x_files[0]
-        config.FactMap[info.importpath] = info.facts.path
+        import_map[info.importpath] = x_files[0]
+        fact_map[info.importpath] = info.facts.path
 
         # Ensure the above are available as inputs.
         inputs.append(info.facts)
         inputs += info.binaries
 
-    # Write the configuration and run the tool.
+    # Add the standard library facts.
+    stdlib_facts = ctx.attr._nogo_stdlib[NogoStdlibInfo].facts
+    inputs.append(stdlib_facts)
+
+    # The nogo tool operates on a configuration serialized in JSON format.
+    facts = ctx.actions.declare_file(target.label.name + ".facts")
+    config = struct(
+        ImportPath = importpath,
+        GoFiles = [src.path for src in srcs if src.path.endswith(".go")],
+        NonGoFiles = [src.path for src in srcs if not src.path.endswith(".go")],
+        GOOS = go_ctx.goos,
+        GOARCH = go_ctx.goarch,
+        Tags = go_ctx.tags,
+        FactMap = fact_map,
+        ImportMap = import_map,
+        StdlibFacts = stdlib_facts.path,
+        FactOutput = facts.path,
+    )
     config_file = ctx.actions.declare_file(target.label.name + ".cfg")
     ctx.actions.write(config_file, config.to_json())
     inputs.append(config_file)
-
-    # Run the nogo tool itself.
     ctx.actions.run(
         inputs = inputs,
         outputs = [facts],
-        tools = go_ctx.runfiles,
+        tools = depset(go_ctx.runfiles.to_list() + ctx.files._dump_tool),
         executable = ctx.files._nogo[0],
         mnemonic = "GoStaticAnalysis",
         progress_message = "Analyzing %s" % target.label,
-        arguments = ["-config=%s" % config_file.path],
+        arguments = go_ctx.nogo_args + [
+            "-binary=%s" % target_objfile.path,
+            "-dump_tool=%s" % ctx.files._dump_tool[0].path,
+            "-package=%s" % config_file.path,
+        ],
     )
 
     # Return the package facts as output.
@@ -172,7 +252,12 @@ nogo_aspect = go_rule(
     attrs = {
         "_nogo": attr.label(
             default = "//tools/nogo/check:check",
-            allow_single_file = True,
+        ),
+        "_nogo_stdlib": attr.label(
+            default = "//tools/nogo:stdlib",
+        ),
+        "_dump_tool": attr.label(
+            default = "//tools/nogo:dump_tool",
         ),
     },
 )
diff --git a/tools/nogo/dump/BUILD b/tools/nogo/dump/BUILD
new file mode 100644
index 000000000..dfa29d651
--- /dev/null
+++ b/tools/nogo/dump/BUILD
@@ -0,0 +1,10 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "dump",
+    srcs = ["dump.go"],
+    nogo = False,
+    visibility = ["//tools:__subpackages__"],
+)
diff --git a/tools/nogo/dump/dump.go b/tools/nogo/dump/dump.go
new file mode 100644
index 000000000..f06567e0f
--- /dev/null
+++ b/tools/nogo/dump/dump.go
@@ -0,0 +1,78 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package dump contains data dump tools.
+//
+// The interface used by the package corresponds to the tool generated by the
+// nogo_dump_tool rule.
+//
+// This package is separate in order to avoid a dependency cycle.
+package dump
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+)
+
+var (
+	// Binary is the binary under analysis.
+	//
+	// See Reader, below.
+	binary = flag.String("binary", "", "binary under analysis")
+
+	// Reader is the input stream.
+	//
+	// This may be set instead of Binary.
+	Reader io.Reader
+
+	// Tool is the tool used to dump a binary.
+	tool = flag.String("dump_tool", "", "tool used to dump a binary")
+)
+
+// Command returns a command that will emit the dumped object on stdout.
+//
+// You must call Wait on the resulting command.
+func Command() (*exec.Cmd, io.Reader, error) {
+	var (
+		args  []string
+		stdin io.Reader
+	)
+	if *binary != "" {
+		args = append(args, *binary)
+		*binary = "" // Clear.
+	} else if Reader != nil {
+		stdin = Reader
+		Reader = nil // Clear.
+	} else {
+		// We have no input stream or binary.
+		return nil, nil, fmt.Errorf("no binary or reader provided!")
+	}
+
+	// Construct our command.
+	cmd := exec.Command(*tool, args...)
+	cmd.Stdin = stdin
+	cmd.Stderr = os.Stderr
+	out, err := cmd.StdoutPipe()
+	if err != nil {
+		return nil, nil, err
+	}
+	if err := cmd.Start(); err != nil {
+		return nil, nil, err
+	}
+
+	return cmd, out, err
+}
diff --git a/tools/nogo/nogo.go b/tools/nogo/nogo.go
index ea1e97076..e44f32d4c 100644
--- a/tools/nogo/nogo.go
+++ b/tools/nogo/nogo.go
@@ -32,51 +32,97 @@ import (
 	"io/ioutil"
 	"log"
 	"os"
+	"path"
 	"path/filepath"
 	"reflect"
+	"strings"
 
 	"golang.org/x/tools/go/analysis"
 	"golang.org/x/tools/go/analysis/internal/facts"
 	"golang.org/x/tools/go/gcexportdata"
-	"gvisor.dev/gvisor/tools/nogo/data"
+	"gvisor.dev/gvisor/tools/nogo/dump"
 )
 
-// pkgConfig is serialized as the configuration.
+// stdlibConfig is serialized as the configuration.
 //
-// This contains everything required for the analysis.
-type pkgConfig struct {
-	ImportPath string
-	GoFiles    []string
-	NonGoFiles []string
-	Tags       []string
+// This contains everything required for stdlib analysis.
+type stdlibConfig struct {
+	Srcs       []string
 	GOOS       string
 	GOARCH     string
-	ImportMap  map[string]string
-	FactMap    map[string]string
+	Tags       []string
 	FactOutput string
-	Objdump    string
-	StdZip     string
 }
 
-// loadFacts finds and loads facts per FactMap.
-func (c *pkgConfig) loadFacts(path string) ([]byte, error) {
-	realPath, ok := c.FactMap[path]
-	if !ok {
-		return nil, nil // No facts available.
+// packageConfig is serialized as the configuration.
+//
+// This contains everything required for single package analysis.
+type packageConfig struct {
+	ImportPath  string
+	GoFiles     []string
+	NonGoFiles  []string
+	Tags        []string
+	GOOS        string
+	GOARCH      string
+	ImportMap   map[string]string
+	FactMap     map[string]string
+	FactOutput  string
+	StdlibFacts string
+}
+
+// loader is a fact-loader function.
+type loader func(string) ([]byte, error)
+
+// saver is a fact-saver function.
+type saver func([]byte) error
+
+// factLoader returns a function that loads facts.
+//
+// This resolves all standard library facts and imported package facts up
+// front. The returned loader function will never return an error, only
+// empty facts.
+//
+// This is done because all stdlib data is stored together, and we don't want
+// to load this data many times over.
+func (c *packageConfig) factLoader() (loader, error) {
+	allFacts := make(map[string][]byte)
+	if c.StdlibFacts != "" {
+		data, err := ioutil.ReadFile(c.StdlibFacts)
+		if err != nil {
+			return nil, fmt.Errorf("error loading stdlib facts from %q: %w", c.StdlibFacts, err)
+		}
+		var stdlibFacts map[string][]byte
+		if err := json.Unmarshal(data, &stdlibFacts); err != nil {
+			return nil, fmt.Errorf("error loading stdlib facts: %w", err)
+		}
+		for pkg, data := range stdlibFacts {
+			allFacts[pkg] = data
+		}
+	}
+	for pkg, file := range c.FactMap {
+		data, err := ioutil.ReadFile(file)
+		if err != nil {
+			return nil, fmt.Errorf("error loading %q: %w", file, err)
+		}
+		allFacts[pkg] = data
 	}
+	return func(path string) ([]byte, error) {
+		return allFacts[path], nil
+	}, nil
+}
 
-	// Read the files file.
-	data, err := ioutil.ReadFile(realPath)
-	if err != nil {
-		return nil, err
+// factSaver may be used directly as a saver.
+func (c *packageConfig) factSaver(factData []byte) error {
+	if c.FactOutput == "" {
+		return nil // Nothing to save.
 	}
-	return data, nil
+	return ioutil.WriteFile(c.FactOutput, factData, 0644)
 }
 
 // shouldInclude indicates whether the file should be included.
 //
 // NOTE: This does only basic parsing of tags.
-func (c *pkgConfig) shouldInclude(path string) (bool, error) {
+func (c *packageConfig) shouldInclude(path string) (bool, error) {
 	ctx := build.Default
 	ctx.GOOS = c.GOOS
 	ctx.GOARCH = c.GOARCH
@@ -90,10 +136,11 @@ func (c *pkgConfig) shouldInclude(path string) (bool, error) {
 // files, and the facts. Note that this importer implementation will always
 // pass when a given package is not available.
 type importer struct {
-	pkgConfig
-	fset    *token.FileSet
-	cache   map[string]*types.Package
-	lastErr error
+	*packageConfig
+	fset     *token.FileSet
+	cache    map[string]*types.Package
+	lastErr  error
+	callback func(string) error
 }
 
 // Import implements types.Importer.Import.
@@ -104,6 +151,17 @@ func (i *importer) Import(path string) (*types.Package, error) {
 		// analyzers are specifically looking for this.
 		return types.Unsafe, nil
 	}
+
+	// Call the internal callback. This is used to resolve loading order
+	// for the standard library. See checkStdlib.
+	if i.callback != nil {
+		if err := i.callback(path); err != nil {
+			i.lastErr = err
+			return nil, err
+		}
+	}
+
+	// Actually load the data.
 	realPath, ok := i.ImportMap[path]
 	var (
 		rc  io.ReadCloser
@@ -112,7 +170,7 @@ func (i *importer) Import(path string) (*types.Package, error) {
 	if !ok {
 		// Not found in the import path. Attempt to find the package
 		// via the standard library.
-		rc, err = i.findStdPkg(path)
+		rc, err = findStdPkg(i.GOOS, i.GOARCH, path)
 	} else {
 		// Open the file.
 		rc, err = os.Open(realPath)
@@ -135,6 +193,139 @@ func (i *importer) Import(path string) (*types.Package, error) {
 // ErrSkip indicates the package should be skipped.
 var ErrSkip = errors.New("skipped")
 
+// checkStdlib checks the standard library.
+//
+// This constructs a synthetic package configuration for each library in the
+// standard library sources, and call checkPackage repeatedly.
+//
+// Note that not all parts of the source are expected to build. We skip obvious
+// test files, and cmd files, which should not be dependencies.
+func checkStdlib(config *stdlibConfig) ([]string, error) {
+	if len(config.Srcs) == 0 {
+		return nil, nil
+	}
+
+	// Ensure all paths are normalized.
+	for i := 0; i < len(config.Srcs); i++ {
+		config.Srcs[i] = path.Clean(config.Srcs[i])
+	}
+
+	// Calculate the root directory.
+	longestPrefix := path.Dir(config.Srcs[0])
+	for _, file := range config.Srcs[1:] {
+		for i := 0; i < len(file) && i < len(longestPrefix); i++ {
+			if file[i] != longestPrefix[i] {
+				// Truncate here; will stop the loop.
+				longestPrefix = longestPrefix[:i]
+				break
+			}
+		}
+	}
+	if len(longestPrefix) > 0 && longestPrefix[len(longestPrefix)-1] != '/' {
+		longestPrefix += "/"
+	}
+
+	// Aggregate all files by directory.
+	packages := make(map[string]*packageConfig)
+	for _, file := range config.Srcs {
+		d := path.Dir(file)
+		if len(longestPrefix) >= len(d) {
+			continue // Not a file.
+		}
+		pkg := path.Dir(file)[len(longestPrefix):]
+		// Skip cmd packages and obvious test files: see above.
+		if strings.HasPrefix(pkg, "cmd/") || strings.HasSuffix(file, "_test.go") {
+			continue
+		}
+		c, ok := packages[pkg]
+		if !ok {
+			c = &packageConfig{
+				ImportPath: pkg,
+				GOOS:       config.GOOS,
+				GOARCH:     config.GOARCH,
+				Tags:       config.Tags,
+			}
+			packages[pkg] = c
+		}
+		// Add the files appropriately. Note that they will be further
+		// filtered by architecture and build tags below, so this need
+		// not be done immediately.
+		if strings.HasSuffix(file, ".go") {
+			c.GoFiles = append(c.GoFiles, file)
+		} else {
+			c.NonGoFiles = append(c.NonGoFiles, file)
+		}
+	}
+
+	// Closure to check a single package.
+	allFindings := make([]string, 0)
+	stdlibFacts := make(map[string][]byte)
+	var checkOne func(pkg string) error // Recursive.
+	checkOne = func(pkg string) error {
+		// Is this already done?
+		if _, ok := stdlibFacts[pkg]; ok {
+			return nil
+		}
+
+		// Lookup the configuration.
+		config, ok := packages[pkg]
+		if !ok {
+			return nil // Not known.
+		}
+
+		// Find the binary package, and provide to objdump.
+		rc, err := findStdPkg(config.GOOS, config.GOARCH, pkg)
+		if err != nil {
+			// If there's no binary for this package, it is likely
+			// not built with the distribution. That's fine, we can
+			// just skip analysis.
+			return nil
+		}
+
+		// Provide the input.
+		oldReader := dump.Reader
+		dump.Reader = rc // For analysis.
+		defer func() {
+			rc.Close()
+			dump.Reader = oldReader // Restore.
+		}()
+
+		// Run the analysis.
+		findings, err := checkPackage(config, func(factData []byte) error {
+			stdlibFacts[pkg] = factData
+			return nil
+		}, checkOne)
+		if err != nil {
+			// If we can't analyze a package from the standard library,
+			// then we skip it. It will simply not have any findings.
+			return nil
+		}
+		allFindings = append(allFindings, findings...)
+		return nil
+	}
+
+	// Check all packages.
+	//
+	// Note that this may call checkOne recursively, so it's not guaranteed
+	// to evaluate in the order provided here. We do ensure however, that
+	// all packages are evaluated.
+	for pkg := range packages {
+		checkOne(pkg)
+	}
+
+	// Write out all findings.
+	factData, err := json.Marshal(stdlibFacts)
+	if err != nil {
+		return nil, fmt.Errorf("error saving stdlib facts: %w", err)
+	}
+	if err := ioutil.WriteFile(config.FactOutput, factData, 0644); err != nil {
+		return nil, fmt.Errorf("error saving findings to %q: %v", config.FactOutput, err)
+	}
+
+	// Return all findings.
+	return allFindings, nil
+}
+
 // checkPackage runs all analyzers.
 //
 // The implementation was adapted from [1], which was in turn adpated from [2].
@@ -143,11 +334,12 @@ var ErrSkip = errors.New("skipped")
 //
 // [1] bazelbuid/rules_go/tools/builders/nogo_main.go
 // [2] golang.org/x/tools/go/checker/internal/checker
-func checkPackage(config pkgConfig) ([]string, error) {
+func checkPackage(config *packageConfig, factSaver saver, importCallback func(string) error) ([]string, error) {
 	imp := &importer{
-		pkgConfig: config,
-		fset:      token.NewFileSet(),
-		cache:     make(map[string]*types.Package),
+		packageConfig: config,
+		fset:          token.NewFileSet(),
+		cache:         make(map[string]*types.Package),
+		callback:      importCallback,
 	}
 
 	// Load all source files.
@@ -184,14 +376,15 @@ func checkPackage(config pkgConfig) ([]string, error) {
 	}
 
 	// Load all package facts.
-	facts, err := facts.Decode(types, config.loadFacts)
+	loader, err := config.factLoader()
+	if err != nil {
+		return nil, fmt.Errorf("error loading facts: %w", err)
+	}
+	facts, err := facts.Decode(types, loader)
 	if err != nil {
 		return nil, fmt.Errorf("error decoding facts: %w", err)
 	}
 
-	// Set the binary global for use.
-	data.Objdump = config.Objdump
-
 	// Register fact types and establish dependencies between analyzers.
 	// The visit closure will execute recursively, and populate results
 	// will all required analysis results.
@@ -263,11 +456,9 @@ func checkPackage(config pkgConfig) ([]string, error) {
 	}
 
 	// Write the output file.
-	if config.FactOutput != "" {
-		factData := facts.Encode()
-		if err := ioutil.WriteFile(config.FactOutput, factData, 0644); err != nil {
-			return nil, fmt.Errorf("error: unable to open facts output %q: %v", config.FactOutput, err)
-		}
+	factData := facts.Encode()
+	if err := factSaver(factData); err != nil {
+		return nil, fmt.Errorf("error: unable to save facts: %v", err)
 	}
 
 	// Convert all diagnostics to strings.
@@ -284,38 +475,56 @@ func checkPackage(config pkgConfig) ([]string, error) {
 }
 
 var (
-	configFile = flag.String("config", "", "configuration file (in JSON format)")
+	packageFile = flag.String("package", "", "package configuration file (in JSON format)")
+	stdlibFile  = flag.String("stdlib", "", "stdlib configuration file (in JSON format)")
 )
 
-// Main is the entrypoint; it should be called directly from main.
-//
-// N.B. This package registers it's own flags.
-func Main() {
-	// Parse all flags.
-	flag.Parse()
-
+func loadConfig(file string, config interface{}) interface{} {
 	// Load the configuration.
-	f, err := os.Open(*configFile)
+	f, err := os.Open(file)
 	if err != nil {
-		log.Fatalf("unable to open configuration %q: %v", *configFile, err)
+		log.Fatalf("unable to open configuration %q: %v", file, err)
 	}
 	defer f.Close()
-	config := new(pkgConfig)
 	dec := json.NewDecoder(f)
 	dec.DisallowUnknownFields()
 	if err := dec.Decode(config); err != nil {
 		log.Fatalf("unable to decode configuration: %v", err)
 	}
+	return config
+}
 
-	// Process the package.
-	findings, err := checkPackage(*config)
+// Main is the entrypoint; it should be called directly from main.
+//
+// N.B. This package registers it's own flags.
+func Main() {
+	// Parse all flags.
+	flag.Parse()
+
+	var (
+		findings []string
+		err      error
+	)
+
+	// Check the configuration.
+	if *packageFile != "" && *stdlibFile != "" {
+		log.Fatalf("unable to perform stdlib and package analysis; provide only one!")
+	} else if *stdlibFile != "" {
+		c := loadConfig(*stdlibFile, new(stdlibConfig)).(*stdlibConfig)
+		findings, err = checkStdlib(c)
+	} else if *packageFile != "" {
+		c := loadConfig(*packageFile, new(packageConfig)).(*packageConfig)
+		findings, err = checkPackage(c, c.factSaver, nil)
+	} else {
+		log.Fatalf("please provide at least one of package or stdlib!")
+	}
+
+	// Handle findings & errors.
 	if err != nil {
 		log.Fatalf("error checking package: %v", err)
 	}
-
-	// No findings?
 	if len(findings) == 0 {
-		os.Exit(0)
+		return
 	}
 
 	// Print findings and exit with non-zero code.
-- 
cgit v1.2.3


From dd8b3ffcb8eb7f7867dbea2c721f7fb7d0ec0342 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Mon, 24 Aug 2020 22:40:20 -0400
Subject: Device major number greater than 2 digits in /proc/self/maps on arm64
 N1 machine

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 pkg/sentry/platform/kvm/virtual_map.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go
index c8897d34f..4dcdbf8a7 100644
--- a/pkg/sentry/platform/kvm/virtual_map.go
+++ b/pkg/sentry/platform/kvm/virtual_map.go
@@ -34,7 +34,7 @@ type virtualRegion struct {
 }
 
 // mapsLine matches a single line from /proc/PID/maps.
-var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
+var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
 
 // excludeRegion returns true if these regions should be excluded from the
 // physical map. Virtual regions need to be excluded if get_user_pages will
-- 
cgit v1.2.3


From 4705782bf39e7202a5fd66a966fac94baf36492b Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 26 Aug 2020 20:22:39 -0700
Subject: Make flag propagation automatic

Use reflection and tags to provide automatic conversion from
Config to flags. This makes adding new flags less error-prone,
skips flags using default values (easier to read), and makes
tests correctly use default flag values for test Configs.

Updates #3494

PiperOrigin-RevId: 328662070
---
 pkg/refs/refcounter.go          |  33 +++
 pkg/sentry/watchdog/watchdog.go |  28 ++-
 pkg/test/testutil/testutil.go   |  31 +--
 runsc/boot/loader_test.go       |  20 +-
 runsc/boot/strace.go            |   4 +-
 runsc/config/BUILD              |  15 +-
 runsc/config/config.go          | 443 ++++++++++++++++++----------------------
 runsc/config/config_test.go     | 185 +++++++++++++++++
 runsc/config/flags.go           | 168 +++++++++++++++
 runsc/flag/flag.go              |  14 +-
 runsc/main.go                   | 170 +++------------
 runsc/sandbox/network.go        |   2 +-
 12 files changed, 691 insertions(+), 422 deletions(-)
 create mode 100644 runsc/config/config_test.go
 create mode 100644 runsc/config/flags.go

(limited to 'pkg')

diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go
index d9d5e6bcb..57d8542b9 100644
--- a/pkg/refs/refcounter.go
+++ b/pkg/refs/refcounter.go
@@ -234,6 +234,39 @@ const (
 	LeaksLogTraces
 )
 
+// Set implements flag.Value.
+func (l *LeakMode) Set(v string) error {
+	switch v {
+	case "disabled":
+		*l = NoLeakChecking
+	case "log-names":
+		*l = LeaksLogWarning
+	case "log-traces":
+		*l = LeaksLogTraces
+	default:
+		return fmt.Errorf("invalid ref leak mode %q", v)
+	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (l *LeakMode) Get() interface{} {
+	return *l
+}
+
+// String implements flag.Value.
+func (l *LeakMode) String() string {
+	switch *l {
+	case NoLeakChecking:
+		return "disabled"
+	case LeaksLogWarning:
+		return "log-names"
+	case LeaksLogTraces:
+		return "log-traces"
+	}
+	panic(fmt.Sprintf("invalid ref leak mode %q", *l))
+}
+
 // leakMode stores the current mode for the reference leak checker.
 //
 // Values must be one of the LeakMode values.
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 748273366..bbafb8b7f 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -96,15 +96,33 @@ const (
 	Panic
 )
 
+// Set implements flag.Value.
+func (a *Action) Set(v string) error {
+	switch v {
+	case "log", "logwarning":
+		*a = LogWarning
+	case "panic":
+		*a = Panic
+	default:
+		return fmt.Errorf("invalid watchdog action %q", v)
+	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (a *Action) Get() interface{} {
+	return *a
+}
+
 // String returns Action's string representation.
-func (a Action) String() string {
-	switch a {
+func (a *Action) String() string {
+	switch *a {
 	case LogWarning:
-		return "LogWarning"
+		return "logWarning"
 	case Panic:
-		return "Panic"
+		return "panic"
 	default:
-		panic(fmt.Sprintf("Invalid action: %d", a))
+		panic(fmt.Sprintf("Invalid watchdog action: %d", *a))
 	}
 }
 
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index 42d79f5c2..b7f873392 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -138,20 +138,23 @@ func TestConfig(t *testing.T) *config.Config {
 	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
 		logDir = dir + "/"
 	}
-	return &config.Config{
-		Debug:              true,
-		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
-		LogFormat:          "text",
-		DebugLogFormat:     "text",
-		LogPackets:         true,
-		Network:            config.NetworkNone,
-		Strace:             true,
-		Platform:           "ptrace",
-		FileAccess:         config.FileAccessExclusive,
-		NumNetworkChannels: 1,
-
-		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
-	}
+
+	// Only register flags if config is being used. Otherwise anyone that uses
+	// testutil will get flags registered and they may conflict.
+	config.RegisterFlags()
+
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		panic(err)
+	}
+	// Change test defaults.
+	conf.Debug = true
+	conf.DebugLog = path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%")
+	conf.LogPackets = true
+	conf.Network = config.NetworkNone
+	conf.Strace = true
+	conf.TestOnlyAllowRunAsCurrentUserWithoutChroot = true
+	return conf
 }
 
 // NewSpecWithArgs creates a simple spec with the given args suitable for use
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 03cbaec33..2343ce76c 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -44,15 +44,19 @@ func init() {
 	if err := fsgofer.OpenProcSelfFD(); err != nil {
 		panic(err)
 	}
+	config.RegisterFlags()
 }
 
 func testConfig() *config.Config {
-	return &config.Config{
-		RootDir:        "unused_root_dir",
-		Network:        config.NetworkNone,
-		DisableSeccomp: true,
-		Platform:       "ptrace",
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		panic(err)
 	}
+	// Change test defaults.
+	conf.RootDir = "unused_root_dir"
+	conf.Network = config.NetworkNone
+	conf.DisableSeccomp = true
+	return conf
 }
 
 // testSpec returns a simple spec that can be used in tests.
@@ -546,7 +550,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -600,7 +604,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 						{
 							Dev:        "9pfs-/dev/fd-foo",
@@ -658,7 +662,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index 176981f74..c21648a32 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,6 +15,8 @@
 package boot
 
 import (
+	"strings"
+
 	"gvisor.dev/gvisor/pkg/sentry/strace"
 	"gvisor.dev/gvisor/runsc/config"
 )
@@ -37,5 +39,5 @@ func enableStrace(conf *config.Config) error {
 		strace.EnableAll(strace.SinkTypeLog)
 		return nil
 	}
-	return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
 }
diff --git a/runsc/config/BUILD b/runsc/config/BUILD
index 3c8713d53..b1672bb9d 100644
--- a/runsc/config/BUILD
+++ b/runsc/config/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -6,10 +6,23 @@ go_library(
     name = "config",
     srcs = [
         "config.go",
+        "flags.go",
     ],
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/refs",
         "//pkg/sentry/watchdog",
+        "//pkg/sync",
+        "//runsc/flag",
     ],
 )
+
+go_test(
+    name = "config_test",
+    size = "small",
+    srcs = [
+        "config_test.go",
+    ],
+    library = ":config",
+    deps = ["//runsc/flag"],
+)
diff --git a/runsc/config/config.go b/runsc/config/config.go
index 8cf0378d5..bca27ebf1 100644
--- a/runsc/config/config.go
+++ b/runsc/config/config.go
@@ -19,254 +19,105 @@ package config
 
 import (
 	"fmt"
-	"strconv"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 )
 
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
-	// FileAccessShared sends IO requests to a Gofer process that validates the
-	// requests and forwards them to the host.
-	FileAccessShared FileAccessType = iota
-
-	// FileAccessExclusive is the same as FileAccessShared, but enables
-	// extra caching for improved performance. It should only be used if
-	// the sandbox has exclusive access to the filesystem.
-	FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
-	switch s {
-	case "shared":
-		return FileAccessShared, nil
-	case "exclusive":
-		return FileAccessExclusive, nil
-	default:
-		return 0, fmt.Errorf("invalid file access type %q", s)
-	}
-}
-
-func (f FileAccessType) String() string {
-	switch f {
-	case FileAccessShared:
-		return "shared"
-	case FileAccessExclusive:
-		return "exclusive"
-	default:
-		return fmt.Sprintf("unknown(%d)", f)
-	}
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
-	// NetworkSandbox uses internal network stack, isolated from the host.
-	NetworkSandbox NetworkType = iota
-
-	// NetworkHost redirects network related syscalls to the host network.
-	NetworkHost
-
-	// NetworkNone sets up just loopback using netstack.
-	NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
-	switch s {
-	case "sandbox":
-		return NetworkSandbox, nil
-	case "host":
-		return NetworkHost, nil
-	case "none":
-		return NetworkNone, nil
-	default:
-		return 0, fmt.Errorf("invalid network type %q", s)
-	}
-}
-
-func (n NetworkType) String() string {
-	switch n {
-	case NetworkSandbox:
-		return "sandbox"
-	case NetworkHost:
-		return "host"
-	case NetworkNone:
-		return "none"
-	default:
-		return fmt.Sprintf("unknown(%d)", n)
-	}
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
-	switch strings.ToLower(s) {
-	case "log", "logwarning":
-		return watchdog.LogWarning, nil
-	case "panic":
-		return watchdog.Panic, nil
-	default:
-		return 0, fmt.Errorf("invalid watchdog action %q", s)
-	}
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
-	switch strings.ToLower(s) {
-	case "disabled":
-		return refs.NoLeakChecking, nil
-	case "log-names":
-		return refs.LeaksLogWarning, nil
-	case "log-traces":
-		return refs.LeaksLogTraces, nil
-	default:
-		return 0, fmt.Errorf("invalid refs leakmode %q", s)
-	}
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
-	switch mode {
-	// If not set, default it to disabled.
-	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
-		return "disabled"
-	case refs.LeaksLogWarning:
-		return "log-names"
-	case refs.LeaksLogTraces:
-		return "log-traces"
-	default:
-		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
-	}
-}
-
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
-	// QDiscNone disables any queueing for the underlying FD.
-	QDiscNone QueueingDiscipline = iota
-
-	// QDiscFIFO applies a simple fifo based queue to the underlying
-	// FD.
-	QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
-	switch s {
-	case "none":
-		return QDiscNone, nil
-	case "fifo":
-		return QDiscFIFO, nil
-	default:
-		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
-	}
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
-	switch q {
-	case QDiscNone:
-		return "none"
-	case QDiscFIFO:
-		return "fifo"
-	default:
-		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
-	}
-}
-
 // Config holds configuration that is not part of the runtime spec.
+//
+// Follow these steps to add a new flag:
+//   1. Create a new field in Config.
+//   2. Add a field tag with the flag name
+//   3. Register a new flag in flags.go, with name and description
+//   4. Add any necessary validation into validate()
+//   5. If adding an enum, follow the same pattern as FileAccessType
+//
 type Config struct {
 	// RootDir is the runtime root directory.
-	RootDir string
+	RootDir string `flag:"root"`
 
 	// Debug indicates that debug logging should be enabled.
-	Debug bool
+	Debug bool `flag:"debug"`
 
 	// LogFilename is the filename to log to, if not empty.
-	LogFilename string
+	LogFilename string `flag:"log"`
 
 	// LogFormat is the log format.
-	LogFormat string
+	LogFormat string `flag:"log-format"`
 
 	// DebugLog is the path to log debug information to, if not empty.
-	DebugLog string
+	DebugLog string `flag:"debug-log"`
 
 	// PanicLog is the path to log GO's runtime messages, if not empty.
-	PanicLog string
+	PanicLog string `flag:"panic-log"`
 
 	// DebugLogFormat is the log format for debug.
-	DebugLogFormat string
+	DebugLogFormat string `flag:"debug-log-format"`
 
 	// FileAccess indicates how the filesystem is accessed.
-	FileAccess FileAccessType
+	FileAccess FileAccessType `flag:"file-access"`
 
 	// Overlay is whether to wrap the root filesystem in an overlay.
-	Overlay bool
+	Overlay bool `flag:"overlay"`
 
 	// FSGoferHostUDS enables the gofer to mount a host UDS.
-	FSGoferHostUDS bool
+	FSGoferHostUDS bool `flag:"fsgofer-host-uds"`
 
 	// Network indicates what type of network to use.
-	Network NetworkType
+	Network NetworkType `flag:"network"`
 
 	// EnableRaw indicates whether raw sockets should be enabled. Raw
 	// sockets are disabled by stripping CAP_NET_RAW from the list of
 	// capabilities.
-	EnableRaw bool
+	EnableRaw bool `flag:"net-raw"`
 
 	// HardwareGSO indicates that hardware segmentation offload is enabled.
-	HardwareGSO bool
+	HardwareGSO bool `flag:"gso"`
 
 	// SoftwareGSO indicates that software segmentation offload is enabled.
-	SoftwareGSO bool
+	SoftwareGSO bool `flag:"software-gso"`
 
 	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
-	TXChecksumOffload bool
+	TXChecksumOffload bool `flag:"tx-checksum-offload"`
 
 	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
-	RXChecksumOffload bool
+	RXChecksumOffload bool `flag:"rx-checksum-offload"`
 
 	// QDisc indicates the type of queuening discipline to use by default
 	// for non-loopback interfaces.
-	QDisc QueueingDiscipline
+	QDisc QueueingDiscipline `flag:"qdisc"`
 
 	// LogPackets indicates that all network packets should be logged.
-	LogPackets bool
+	LogPackets bool `flag:"log-packets"`
 
 	// Platform is the platform to run on.
-	Platform string
+	Platform string `flag:"platform"`
 
 	// Strace indicates that strace should be enabled.
-	Strace bool
+	Strace bool `flag:"strace"`
 
-	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
-	// true and this list is empty, then all syscalls will be traced.
-	StraceSyscalls []string
+	// StraceSyscalls is the set of syscalls to trace (comma-separated values).
+	// If StraceEnable is true and this string is empty, then all syscalls will
+	// be traced.
+	StraceSyscalls string `flag:"strace-syscalls"`
 
 	// StraceLogSize is the max size of data blobs to display.
-	StraceLogSize uint
+	StraceLogSize uint `flag:"strace-log-size"`
 
 	// DisableSeccomp indicates whether seccomp syscall filters should be
 	// disabled. Pardon the double negation, but default to enabled is important.
 	DisableSeccomp bool
 
 	// WatchdogAction sets what action the watchdog takes when triggered.
-	WatchdogAction watchdog.Action
+	WatchdogAction watchdog.Action `flag:"watchdog-action"`
 
 	// PanicSignal registers signal handling that panics. Usually set to
 	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
-	PanicSignal int
+	PanicSignal int `flag:"panic-signal"`
 
 	// ProfileEnable is set to prepare the sandbox to be profiled.
-	ProfileEnable bool
+	ProfileEnable bool `flag:"profile"`
 
 	// RestoreFile is the path to the saved container image
 	RestoreFile string
@@ -274,105 +125,209 @@ type Config struct {
 	// NumNetworkChannels controls the number of AF_PACKET sockets that map
 	// to the same underlying network device. This allows netstack to better
 	// scale for high throughput use cases.
-	NumNetworkChannels int
+	NumNetworkChannels int `flag:"num-network-channels"`
 
 	// Rootless allows the sandbox to be started with a user that is not root.
 	// Defense is depth measures are weaker with rootless. Specifically, the
 	// sandbox and Gofer process run as root inside a user namespace with root
 	// mapped to the caller's user.
-	Rootless bool
+	Rootless bool `flag:"rootless"`
 
 	// AlsoLogToStderr allows to send log messages to stderr.
-	AlsoLogToStderr bool
+	AlsoLogToStderr bool `flag:"alsologtostderr"`
 
 	// ReferenceLeakMode sets reference leak check mode
-	ReferenceLeakMode refs.LeakMode
+	ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"`
 
 	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
 	// is on a Linux overlayfs mount, which does not necessarily preserve
 	// coherence between read-only and subsequent writable file descriptors
 	// representing the "same" file.
-	OverlayfsStaleRead bool
+	OverlayfsStaleRead bool `flag:"overlayfs-stale-read"`
 
 	// CPUNumFromQuota sets CPU number count to available CPU quota, using
 	// least integer value greater than or equal to quota.
 	//
 	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
-	CPUNumFromQuota bool
+	CPUNumFromQuota bool `flag:"cpu-num-from-quota"`
 
-	// Enables VFS2 (not plumbed through yet).
-	VFS2 bool
+	// Enables VFS2.
+	VFS2 bool `flag:"vfs2"`
 
-	// Enables FUSE usage (not plumbed through yet).
-	FUSE bool
+	// Enables FUSE usage.
+	FUSE bool `flag:"fuse"`
 
 	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
 	// tests. It allows runsc to start the sandbox process as the current
 	// user, and without chrooting the sandbox process. This can be
 	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"`
 
 	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
 	// test name in the container environment variables and adds it to the debug
 	// log file name. This is done to help identify the log with the test when
 	// multiple tests are run in parallel, since there is no way to pass
 	// parameters to the runtime from docker.
-	TestOnlyTestNameEnv string
+	TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"`
+}
+
+func (c *Config) validate() error {
+	if c.FileAccess == FileAccessShared && c.Overlay {
+		return fmt.Errorf("overlay flag is incompatible with shared file access")
+	}
+	if c.NumNetworkChannels <= 0 {
+		return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
+	}
+	return nil
 }
 
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
-	f := []string{
-		"--root=" + c.RootDir,
-		"--debug=" + strconv.FormatBool(c.Debug),
-		"--log=" + c.LogFilename,
-		"--log-format=" + c.LogFormat,
-		"--debug-log=" + c.DebugLog,
-		"--panic-log=" + c.PanicLog,
-		"--debug-log-format=" + c.DebugLogFormat,
-		"--file-access=" + c.FileAccess.String(),
-		"--overlay=" + strconv.FormatBool(c.Overlay),
-		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
-		"--network=" + c.Network.String(),
-		"--log-packets=" + strconv.FormatBool(c.LogPackets),
-		"--platform=" + c.Platform,
-		"--strace=" + strconv.FormatBool(c.Strace),
-		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
-		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
-		"--watchdog-action=" + c.WatchdogAction.String(),
-		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
-		"--profile=" + strconv.FormatBool(c.ProfileEnable),
-		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
-		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
-		"--rootless=" + strconv.FormatBool(c.Rootless),
-		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
-		"--gso=" + strconv.FormatBool(c.HardwareGSO),
-		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
-		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
-		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
-		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
-		"--qdisc=" + c.QDisc.String(),
-		"--vfs2=" + strconv.FormatBool(c.VFS2),
-		"--fuse=" + strconv.FormatBool(c.FUSE),
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
+
+const (
+	// FileAccessExclusive is the same as FileAccessShared, but enables
+	// extra caching for improved performance. It should only be used if
+	// the sandbox has exclusive access to the filesystem.
+	FileAccessExclusive FileAccessType = iota
+
+	// FileAccessShared sends IO requests to a Gofer process that validates the
+	// requests and forwards them to the host.
+	FileAccessShared
+)
+
+func fileAccessTypePtr(v FileAccessType) *FileAccessType {
+	return &v
+}
+
+// Set implements flag.Value.
+func (f *FileAccessType) Set(v string) error {
+	switch v {
+	case "shared":
+		*f = FileAccessShared
+	case "exclusive":
+		*f = FileAccessExclusive
+	default:
+		return fmt.Errorf("invalid file access type %q", v)
 	}
-	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota")
+	return nil
+}
+
+// Get implements flag.Value.
+func (f *FileAccessType) Get() interface{} {
+	return *f
+}
+
+// String implements flag.Value.
+func (f *FileAccessType) String() string {
+	switch *f {
+	case FileAccessShared:
+		return "shared"
+	case FileAccessExclusive:
+		return "exclusive"
 	}
-	if c.VFS2 {
-		f = append(f, "--vfs2=true")
+	panic(fmt.Sprintf("Invalid file access type %v", *f))
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+	// NetworkSandbox uses internal network stack, isolated from the host.
+	NetworkSandbox NetworkType = iota
+
+	// NetworkHost redirects network related syscalls to the host network.
+	NetworkHost
+
+	// NetworkNone sets up just loopback using netstack.
+	NetworkNone
+)
+
+func networkTypePtr(v NetworkType) *NetworkType {
+	return &v
+}
+
+// Set implements flag.Value.
+func (n *NetworkType) Set(v string) error {
+	switch v {
+	case "sandbox":
+		*n = NetworkSandbox
+	case "host":
+		*n = NetworkHost
+	case "none":
+		*n = NetworkNone
+	default:
+		return fmt.Errorf("invalid network type %q", v)
 	}
-	if c.FUSE {
-		f = append(f, "--fuse=true")
+	return nil
+}
+
+// Get implements flag.Value.
+func (n *NetworkType) Get() interface{} {
+	return *n
+}
+
+// String implements flag.Value.
+func (n *NetworkType) String() string {
+	switch *n {
+	case NetworkSandbox:
+		return "sandbox"
+	case NetworkHost:
+		return "host"
+	case NetworkNone:
+		return "none"
 	}
+	panic(fmt.Sprintf("Invalid network type %v", *n))
+}
+
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a give FDBasedLink.
+type QueueingDiscipline int
 
-	// Only include these if set since it is never to be used by users.
-	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		f = append(f, "--TESTONLY-unsafe-nonroot=true")
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying FD.
+	QDiscFIFO
+)
+
+func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline {
+	return &v
+}
+
+// Set implements flag.Value.
+func (q *QueueingDiscipline) Set(v string) error {
+	switch v {
+	case "none":
+		*q = QDiscNone
+	case "fifo":
+		*q = QDiscFIFO
+	default:
+		return fmt.Errorf("invalid qdisc %q", v)
 	}
-	if len(c.TestOnlyTestNameEnv) != 0 {
-		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
+	return nil
+}
+
+// Get implements flag.Value.
+func (q *QueueingDiscipline) Get() interface{} {
+	return *q
+}
+
+// String implements flag.Value.
+func (q *QueueingDiscipline) String() string {
+	switch *q {
+	case QDiscNone:
+		return "none"
+	case QDiscFIFO:
+		return "fifo"
 	}
+	panic(fmt.Sprintf("Invalid qdisc %v", *q))
+}
+
+func leakModePtr(v refs.LeakMode) *refs.LeakMode {
+	return &v
+}
 
-	return f
+func watchdogActionPtr(v watchdog.Action) *watchdog.Action {
+	return &v
 }
diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go
new file mode 100644
index 000000000..af7867a2a
--- /dev/null
+++ b/runsc/config/config_test.go
@@ -0,0 +1,185 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+func init() {
+	RegisterFlags()
+}
+
+func TestDefault(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	// "--root" is always set to something different than the default. Reset it
+	// to make it easier to test that default values do not generate flags.
+	c.RootDir = ""
+
+	// All defaults doesn't require setting flags.
+	flags := c.ToFlags()
+	if len(flags) > 0 {
+		t.Errorf("default flags not set correctly for: %s", flags)
+	}
+}
+
+func setDefault(name string) {
+	fl := flag.CommandLine.Lookup(name)
+	fl.Value.Set(fl.DefValue)
+}
+
+func TestFromFlags(t *testing.T) {
+	flag.CommandLine.Lookup("root").Value.Set("some-path")
+	flag.CommandLine.Lookup("debug").Value.Set("true")
+	flag.CommandLine.Lookup("num-network-channels").Value.Set("123")
+	flag.CommandLine.Lookup("network").Value.Set("none")
+	defer func() {
+		setDefault("root")
+		setDefault("debug")
+		setDefault("num-network-channels")
+		setDefault("network")
+	}()
+
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if want := "some-path"; c.RootDir != want {
+		t.Errorf("RootDir=%v, want: %v", c.RootDir, want)
+	}
+	if want := true; c.Debug != want {
+		t.Errorf("Debug=%v, want: %v", c.Debug, want)
+	}
+	if want := 123; c.NumNetworkChannels != want {
+		t.Errorf("NumNetworkChannels=%v, want: %v", c.NumNetworkChannels, want)
+	}
+	if want := NetworkNone; c.Network != want {
+		t.Errorf("Network=%v, want: %v", c.Network, want)
+	}
+}
+
+func TestToFlags(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.RootDir = "some-path"
+	c.Debug = true
+	c.NumNetworkChannels = 123
+	c.Network = NetworkNone
+
+	flags := c.ToFlags()
+	if len(flags) != 4 {
+		t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags)
+	}
+	t.Logf("Flags: %s", flags)
+	fm := map[string]string{}
+	for _, f := range flags {
+		kv := strings.Split(f, "=")
+		fm[kv[0]] = kv[1]
+	}
+	for name, want := range map[string]string{
+		"--root":                 "some-path",
+		"--debug":                "true",
+		"--num-network-channels": "123",
+		"--network":              "none",
+	} {
+		if got, ok := fm[name]; ok {
+			if got != want {
+				t.Errorf("flag %q, want: %q, got: %q", name, want, got)
+			}
+		} else {
+			t.Errorf("flag %q not set", name)
+		}
+	}
+}
+
+// TestInvalidFlags checks that enum flags fail when value is not in enum set.
+func TestInvalidFlags(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		error string
+	}{
+		{
+			name:  "file-access",
+			error: "invalid file access type",
+		},
+		{
+			name:  "network",
+			error: "invalid network type",
+		},
+		{
+			name:  "qdisc",
+			error: "invalid qdisc",
+		},
+		{
+			name:  "watchdog-action",
+			error: "invalid watchdog action",
+		},
+		{
+			name:  "ref-leak-mode",
+			error: "invalid ref leak mode",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			defer setDefault(tc.name)
+			if err := flag.CommandLine.Lookup(tc.name).Value.Set("invalid"); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("flag.Value.Set(invalid) wrong error reported: %v", err)
+			}
+		})
+	}
+}
+
+func TestValidationFail(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		flags map[string]string
+		error string
+	}{
+		{
+			name: "shared+overlay",
+			flags: map[string]string{
+				"file-access": "shared",
+				"overlay":     "true",
+			},
+			error: "overlay flag is incompatible",
+		},
+		{
+			name: "network-channels",
+			flags: map[string]string{
+				"num-network-channels": "-1",
+			},
+			error: "num_network_channels must be > 0",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			for name, val := range tc.flags {
+				defer setDefault(name)
+				if err := flag.CommandLine.Lookup(name).Value.Set(val); err != nil {
+					t.Errorf("%s=%q: %v", name, val, err)
+				}
+			}
+			if _, err := NewFromFlags(); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("NewFromFlags() wrong error reported: %v", err)
+			}
+		})
+	}
+}
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
new file mode 100644
index 000000000..488a4b9fb
--- /dev/null
+++ b/runsc/config/flags.go
@@ -0,0 +1,168 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+var registration sync.Once
+
+// This is the set of flags used to populate Config.
+func RegisterFlags() {
+	registration.Do(func() {
+		// Although these flags are not part of the OCI spec, they are used by
+		// Docker, and thus should not be changed.
+		flag.String("root", "", "root directory for storage of container state.")
+		flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+		flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+		flag.Bool("debug", false, "enable debug logging.")
+
+		// These flags are unique to runsc, and are used to configure parts of the
+		// system that are not covered by the runtime spec.
+
+		// Debugging flags.
+		flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+		flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
+		flag.Bool("log-packets", false, "enable network packet logging.")
+		flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+		flag.Bool("alsologtostderr", false, "send log messages to stderr.")
+
+		// Debugging flags: strace related
+		flag.Bool("strace", false, "enable strace.")
+		flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+		flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
+
+		// Flags that control sandbox runtime behavior.
+		flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
+		flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.")
+		flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+		flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+		flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
+		flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+		flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+
+		// Flags that control sandbox runtime behavior: FS related.
+		flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+		flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+		flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
+		flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
+		flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
+		flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
+
+		// Flags that control sandbox runtime behavior: network related.
+		flag.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+		flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+		flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+		flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
+		flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
+		flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
+		flag.Var(queueingDisciplinePtr(QDiscFIFO), "qdisc", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
+		flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
+
+		// Test flags, not to be used outside tests, ever.
+		flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+		flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+	})
+}
+
+// NewFromFlags creates a new Config with values coming from command line flags.
+func NewFromFlags() (*Config, error) {
+	conf := &Config{}
+
+	obj := reflect.ValueOf(conf).Elem()
+	st := obj.Type()
+	for i := 0; i < st.NumField(); i++ {
+		f := st.Field(i)
+		name, ok := f.Tag.Lookup("flag")
+		if !ok {
+			// No flag set for this field.
+			continue
+		}
+		fl := flag.CommandLine.Lookup(name)
+		if fl == nil {
+			panic(fmt.Sprintf("Flag %q not found", name))
+		}
+		x := reflect.ValueOf(flag.Get(fl.Value))
+		obj.Field(i).Set(x)
+	}
+
+	if len(conf.RootDir) == 0 {
+		// If not set, set default root dir to something (hopefully) user-writeable.
+		conf.RootDir = "/var/run/runsc"
+		if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
+			conf.RootDir = filepath.Join(runtimeDir, "runsc")
+		}
+	}
+
+	if err := conf.validate(); err != nil {
+		return nil, err
+	}
+	return conf, nil
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+	var rv []string
+
+	obj := reflect.ValueOf(c).Elem()
+	st := obj.Type()
+	for i := 0; i < st.NumField(); i++ {
+		f := st.Field(i)
+		name, ok := f.Tag.Lookup("flag")
+		if !ok {
+			// No flag set for this field.
+			continue
+		}
+		val := getVal(obj.Field(i))
+
+		flag := flag.CommandLine.Lookup(name)
+		if flag == nil {
+			panic(fmt.Sprintf("Flag %q not found", name))
+		}
+		if val == flag.DefValue {
+			continue
+		}
+		rv = append(rv, fmt.Sprintf("--%s=%s", flag.Name, val))
+	}
+	return rv
+}
+
+func getVal(field reflect.Value) string {
+	if str, ok := field.Addr().Interface().(fmt.Stringer); ok {
+		return str.String()
+	}
+	switch field.Kind() {
+	case reflect.Bool:
+		return strconv.FormatBool(field.Bool())
+	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
+		return strconv.FormatInt(field.Int(), 10)
+	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
+		return strconv.FormatUint(field.Uint(), 10)
+	case reflect.String:
+		return field.String()
+	default:
+		panic("unknown type " + field.Kind().String())
+	}
+}
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index 0ca4829d7..ba1ff833f 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -21,13 +21,19 @@ import (
 type FlagSet = flag.FlagSet
 
 var (
-	NewFlagSet  = flag.NewFlagSet
-	String      = flag.String
 	Bool        = flag.Bool
-	Int         = flag.Int
-	Uint        = flag.Uint
 	CommandLine = flag.CommandLine
+	Int         = flag.Int
+	NewFlagSet  = flag.NewFlagSet
 	Parse       = flag.Parse
+	String      = flag.String
+	Uint        = flag.Uint
+	Var         = flag.Var
 )
 
 const ContinueOnError = flag.ContinueOnError
+
+// Get returns the flag's underlying object.
+func Get(v flag.Value) interface{} {
+	return v.(flag.Getter).Get()
+}
diff --git a/runsc/main.go b/runsc/main.go
index c2ffecbdc..ed244c4ba 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -23,8 +23,6 @@ import (
 	"io/ioutil"
 	"os"
 	"os/signal"
-	"path/filepath"
-	"strings"
 	"syscall"
 	"time"
 
@@ -41,58 +39,17 @@ import (
 var (
 	// Although these flags are not part of the OCI spec, they are used by
 	// Docker, and thus should not be changed.
-	rootDir     = flag.String("root", "", "root directory for storage of container state.")
-	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
-	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
-	debug       = flag.Bool("debug", false, "enable debug logging.")
-	showVersion = flag.Bool("version", false, "show version and exit.")
 	// TODO(gvisor.dev/issue/193): support systemd cgroups
 	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
+	showVersion   = flag.Bool("version", false, "show version and exit.")
 
 	// These flags are unique to runsc, and are used to configure parts of the
 	// system that are not covered by the runtime spec.
 
 	// Debugging flags.
-	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
-	panicLog        = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
-	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
-	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
-	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
-	panicLogFD      = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
-	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
-	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
-
-	// Debugging flags: strace related
-	strace         = flag.Bool("strace", false, "enable strace.")
-	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
-	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
-
-	// Flags that control sandbox runtime behavior.
-	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
-	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
-	hardwareGSO        = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
-	softwareGSO        = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
-	txChecksumOffload  = flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
-	rxChecksumOffload  = flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
-	qDisc              = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
-	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
-	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
-	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
-	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
-	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
-	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
-	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
-	netRaw             = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
-	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
-	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
-	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
-	vfs2Enabled        = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
-	fuseEnabled        = flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
-
-	// Test flags, not to be used outside tests, ever.
-	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
-	testOnlyTestNameEnv                        = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+	logFD      = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
+	debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
+	panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
 )
 
 func main() {
@@ -136,6 +93,8 @@ func main() {
 	subcommands.Register(new(cmd.Gofer), internalGroup)
 	subcommands.Register(new(cmd.Statefile), internalGroup)
 
+	config.RegisterFlags()
+
 	// All subcommands must be registered before flag parsing.
 	flag.Parse()
 
@@ -147,6 +106,12 @@ func main() {
 		os.Exit(0)
 	}
 
+	// Create a new Config from the flags.
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		cmd.Fatalf(err.Error())
+	}
+
 	// TODO(gvisor.dev/issue/193): support systemd cgroups
 	if *systemdCgroup {
 		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
@@ -157,103 +122,28 @@ func main() {
 	if *logFD > -1 {
 		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
 
-	} else if *logFilename != "" {
+	} else if conf.LogFilename != "" {
 		// We must set O_APPEND and not O_TRUNC because Docker passes
 		// the same log file for all commands (and also parses these
 		// log files), so we can't destroy them on each command.
 		var err error
-		errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+		errorLogger, err = os.OpenFile(conf.LogFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
 		if err != nil {
-			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+			cmd.Fatalf("error opening log file %q: %v", conf.LogFilename, err)
 		}
 	}
 	cmd.ErrorLogger = errorLogger
 
-	platformType := *platformName
-	if _, err := platform.Lookup(platformType); err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	fsAccess, err := config.MakeFileAccessType(*fileAccess)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	if fsAccess == config.FileAccessShared && *overlay {
-		cmd.Fatalf("overlay flag is incompatible with shared file access")
-	}
-
-	netType, err := config.MakeNetworkType(*network)
-	if err != nil {
+	if _, err := platform.Lookup(conf.Platform); err != nil {
 		cmd.Fatalf("%v", err)
 	}
 
-	wa, err := config.MakeWatchdogAction(*watchdogAction)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	if *numNetworkChannels <= 0 {
-		cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
-	}
-
-	refsLeakMode, err := config.MakeRefsLeakMode(*referenceLeakMode)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	queueingDiscipline, err := config.MakeQueueingDiscipline(*qDisc)
-	if err != nil {
-		cmd.Fatalf("%s", err)
-	}
-
 	// Sets the reference leak check mode. Also set it in config below to
 	// propagate it to child processes.
-	refs.SetLeakMode(refsLeakMode)
-
-	// Create a new Config from the flags.
-	conf := &config.Config{
-		RootDir:            *rootDir,
-		Debug:              *debug,
-		LogFilename:        *logFilename,
-		LogFormat:          *logFormat,
-		DebugLog:           *debugLog,
-		PanicLog:           *panicLog,
-		DebugLogFormat:     *debugLogFormat,
-		FileAccess:         fsAccess,
-		FSGoferHostUDS:     *fsGoferHostUDS,
-		Overlay:            *overlay,
-		Network:            netType,
-		HardwareGSO:        *hardwareGSO,
-		SoftwareGSO:        *softwareGSO,
-		TXChecksumOffload:  *txChecksumOffload,
-		RXChecksumOffload:  *rxChecksumOffload,
-		LogPackets:         *logPackets,
-		Platform:           platformType,
-		Strace:             *strace,
-		StraceLogSize:      *straceLogSize,
-		WatchdogAction:     wa,
-		PanicSignal:        *panicSignal,
-		ProfileEnable:      *profile,
-		EnableRaw:          *netRaw,
-		NumNetworkChannels: *numNetworkChannels,
-		Rootless:           *rootless,
-		AlsoLogToStderr:    *alsoLogToStderr,
-		ReferenceLeakMode:  refsLeakMode,
-		OverlayfsStaleRead: *overlayfsStaleRead,
-		CPUNumFromQuota:    *cpuNumFromQuota,
-		VFS2:               *vfs2Enabled,
-		FUSE:               *fuseEnabled,
-		QDisc:              queueingDiscipline,
-		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
-		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
-	}
-	if len(*straceSyscalls) != 0 {
-		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
-	}
+	refs.SetLeakMode(conf.ReferenceLeak)
 
 	// Set up logging.
-	if *debug {
+	if conf.Debug {
 		log.SetLevel(log.Debug)
 	}
 
@@ -275,14 +165,14 @@ func main() {
 	if *debugLogFD > -1 {
 		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
 
-		e = newEmitter(*debugLogFormat, f)
+		e = newEmitter(conf.DebugLogFormat, f)
 
-	} else if *debugLog != "" {
-		f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */)
+	} else if conf.DebugLog != "" {
+		f, err := specutils.DebugLogFile(conf.DebugLog, subcommand, "" /* name */)
 		if err != nil {
-			cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
+			cmd.Fatalf("error opening debug log file in %q: %v", conf.DebugLog, err)
 		}
-		e = newEmitter(*debugLogFormat, f)
+		e = newEmitter(conf.DebugLogFormat, f)
 
 	} else {
 		// Stderr is reserved for the application, just discard the logs if no debug
@@ -308,8 +198,8 @@ func main() {
 		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
 		}
-	} else if *alsoLogToStderr {
-		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
+	} else if conf.AlsoLogToStderr {
+		e = &log.MultiEmitter{e, newEmitter(conf.DebugLogFormat, os.Stderr)}
 	}
 
 	log.SetTarget(e)
@@ -328,7 +218,7 @@ func main() {
 	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
 	log.Infof("***************************")
 
-	if *testOnlyAllowRunAsCurrentUserWithoutChroot {
+	if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 		// SIGTERM is sent to all processes if a test exceeds its
 		// timeout and this case is handled by syscall_test_runner.
 		log.Warningf("Block the TERM signal. This is only safe in tests!")
@@ -364,11 +254,3 @@ func newEmitter(format string, logFile io.Writer) log.Emitter {
 	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
 	panic("unreachable")
 }
-
-func init() {
-	// Set default root dir to something (hopefully) user-writeable.
-	*rootDir = "/var/run/runsc"
-	if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
-		*rootDir = filepath.Join(runtimeDir, "runsc")
-	}
-}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index f9abb2d44..0b9f39466 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -69,7 +69,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Con
 	case config.NetworkHost:
 		// Nothing to do here.
 	default:
-		return fmt.Errorf("invalid network type: %d", conf.Network)
+		return fmt.Errorf("invalid network type: %v", conf.Network)
 	}
 	return nil
 }
-- 
cgit v1.2.3


From 97d6398d435025c7ab361c36994feab2c7e2d84f Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 27 Aug 2020 10:51:59 -0700
Subject: ip6tables: (de)serialize ip6tables structs

More implementation+testing to follow.

#3549.

PiperOrigin-RevId: 328770160
---
 pkg/abi/linux/netfilter_ipv6.go          |  13 ++
 pkg/sentry/socket/netfilter/BUILD        |   1 +
 pkg/sentry/socket/netfilter/ipv4.go      |  33 +++-
 pkg/sentry/socket/netfilter/ipv6.go      | 265 +++++++++++++++++++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go |  77 +++++----
 pkg/sentry/socket/netfilter/targets.go   |  10 +-
 pkg/sentry/socket/netstack/netstack.go   |  75 ++++++++-
 pkg/tcpip/stack/iptables.go              |  12 +-
 pkg/tcpip/stack/iptables_types.go        |   5 +
 test/syscalls/linux/ip6tables.cc         |  48 ++++++
 10 files changed, 489 insertions(+), 50 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/ipv6.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/netfilter_ipv6.go b/pkg/abi/linux/netfilter_ipv6.go
index 9bb9efb10..f6117024c 100644
--- a/pkg/abi/linux/netfilter_ipv6.go
+++ b/pkg/abi/linux/netfilter_ipv6.go
@@ -290,6 +290,19 @@ type IP6TIP struct {
 
 const SizeOfIP6TIP = 136
 
+// Flags in IP6TIP.Flags. Corresponding constants are in
+// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
+const (
+	// Whether to check the Protocol field.
+	IP6T_F_PROTO = 0x01
+	// Whether to match the TOS field.
+	IP6T_F_TOS = 0x02
+	// Indicates that the jump target is an aboslute GOTO, not an offset.
+	IP6T_F_GOTO = 0x04
+	// Enables all flags.
+	IP6T_F_MASK = 0x07
+)
+
 // Flags in IP6TIP.InverseFlags. Corresponding constants are in
 // include/uapi/linux/netfilter_ipv6/ip6_tables.h.
 const (
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 795620589..8aea0200f 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "extensions.go",
         "ipv4.go",
+        "ipv6.go",
         "netfilter.go",
         "owner_matcher.go",
         "targets.go",
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
index 4fb887e49..e4c55a100 100644
--- a/pkg/sentry/socket/netfilter/ipv4.go
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -36,14 +36,37 @@ var emptyIPv4Filter = stack.IPHeaderFilter{
 	SrcMask: "\x00\x00\x00\x00",
 }
 
-func getEntries4(table stack.Table, info *linux.IPTGetinfo) linux.KernelIPTGetEntries {
+// convertNetstackToBinary4 converts the iptables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a little data, reading some
+// offsets, jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
+	// The table name has to fit in the struct.
+	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+	}
+
+	table, ok := stk.IPTables().GetTable(tablename.String(), false)
+	if !ok {
+		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+	}
+
+	// Setup the info struct.
+	entries, info := getEntries4(table, tablename)
+	return entries, info, nil
+}
+
+func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo) {
+	var info linux.IPTGetinfo
 	var entries linux.KernelIPTGetEntries
+	copy(info.Name[:], tablename[:])
 	copy(entries.Name[:], info.Name[:])
+	info.ValidHooks = table.ValidHooks()
 
 	for ruleIdx, rule := range table.Rules {
 		nflog("convert to binary: current offset: %d", entries.Size)
 
-		setHooksAndUnderflow(info, table, entries.Size, ruleIdx)
+		setHooksAndUnderflow(&info, table, entries.Size, ruleIdx)
 		// Each rule corresponds to an entry.
 		entry := linux.KernelIPTEntry{
 			Entry: linux.IPTEntry{
@@ -100,7 +123,7 @@ func getEntries4(table stack.Table, info *linux.IPTGetinfo) linux.KernelIPTGetEn
 
 	info.Size = entries.Size
 	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
-	return entries
+	return entries, info
 }
 
 func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
@@ -205,7 +228,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 	ifnameMask := string(iptip.OutputInterfaceMask[:n])
 
 	return stack.IPHeaderFilter{
-		Protocol:              tcpip.TransportProtocolNumber(iptip.Protocol),
+		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+		// A Protocol value of 0 indicates all protocols match.
+		CheckProtocol:         iptip.Protocol != 0,
 		Dst:                   tcpip.Address(iptip.Dst[:]),
 		DstMask:               tcpip.Address(iptip.DstMask[:]),
 		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go
new file mode 100644
index 000000000..3b2c1becd
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/ipv6.go
@@ -0,0 +1,265 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// emptyIPv6Filter is for comparison with a rule's filters to determine whether
+// it is also empty. It is immutable.
+var emptyIPv6Filter = stack.IPHeaderFilter{
+	Dst:     "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	DstMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	Src:     "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	SrcMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+}
+
+// convertNetstackToBinary6 converts the ip6tables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a little data, reading some
+// offsets, jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo, error) {
+	// The table name has to fit in the struct.
+	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+		return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+	}
+
+	table, ok := stk.IPTables().GetTable(tablename.String(), true)
+	if !ok {
+		return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+	}
+
+	// Setup the info struct, which is the same in IPv4 and IPv6.
+	entries, info := getEntries6(table, tablename)
+	return entries, info, nil
+}
+
+func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo) {
+	var info linux.IPTGetinfo
+	var entries linux.KernelIP6TGetEntries
+	copy(info.Name[:], tablename[:])
+	copy(entries.Name[:], info.Name[:])
+	info.ValidHooks = table.ValidHooks()
+
+	for ruleIdx, rule := range table.Rules {
+		nflog("convert to binary: current offset: %d", entries.Size)
+
+		setHooksAndUnderflow(&info, table, entries.Size, ruleIdx)
+		// Each rule corresponds to an entry.
+		entry := linux.KernelIP6TEntry{
+			Entry: linux.IP6TEntry{
+				IPv6: linux.IP6TIP{
+					Protocol: uint16(rule.Filter.Protocol),
+				},
+				NextOffset:   linux.SizeOfIP6TEntry,
+				TargetOffset: linux.SizeOfIP6TEntry,
+			},
+		}
+		copy(entry.Entry.IPv6.Dst[:], rule.Filter.Dst)
+		copy(entry.Entry.IPv6.DstMask[:], rule.Filter.DstMask)
+		copy(entry.Entry.IPv6.Src[:], rule.Filter.Src)
+		copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask)
+		copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface)
+		copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+		if rule.Filter.DstInvert {
+			entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP
+		}
+		if rule.Filter.SrcInvert {
+			entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_SRCIP
+		}
+		if rule.Filter.OutputInterfaceInvert {
+			entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_VIA_OUT
+		}
+		if rule.Filter.CheckProtocol {
+			entry.Entry.IPv6.Flags |= linux.IP6T_F_PROTO
+		}
+
+		for _, matcher := range rule.Matchers {
+			// Serialize the matcher and add it to the
+			// entry.
+			serialized := marshalMatcher(matcher)
+			nflog("convert to binary: matcher serialized as: %v", serialized)
+			if len(serialized)%8 != 0 {
+				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+			}
+			entry.Elems = append(entry.Elems, serialized...)
+			entry.Entry.NextOffset += uint16(len(serialized))
+			entry.Entry.TargetOffset += uint16(len(serialized))
+		}
+
+		// Serialize and append the target.
+		serialized := marshalTarget(rule.Target)
+		if len(serialized)%8 != 0 {
+			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+		}
+		entry.Elems = append(entry.Elems, serialized...)
+		entry.Entry.NextOffset += uint16(len(serialized))
+
+		nflog("convert to binary: adding entry: %+v", entry)
+
+		entries.Size += uint32(entry.Entry.NextOffset)
+		entries.Entrytable = append(entries.Entrytable, entry)
+		info.NumEntries++
+	}
+
+	info.Size = entries.Size
+	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+	return entries, info
+}
+
+func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+	nflog("set entries: setting entries in table %q", replace.Name.String())
+
+	// Convert input into a list of rules and their offsets.
+	var offset uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
+	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		nflog("set entries: processing entry at offset %d", offset)
+
+		// Get the struct ipt_entry.
+		if len(optVal) < linux.SizeOfIP6TEntry {
+			nflog("optVal has insufficient size for entry %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		var entry linux.IP6TEntry
+		buf := optVal[:linux.SizeOfIP6TEntry]
+		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+		initialOptValLen := len(optVal)
+		optVal = optVal[linux.SizeOfIP6TEntry:]
+
+		if entry.TargetOffset < linux.SizeOfIP6TEntry {
+			nflog("entry has too-small target offset %d", entry.TargetOffset)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): We should support more IPTIP
+		// filtering fields.
+		filter, err := filterFromIP6TIP(entry.IPv6)
+		if err != nil {
+			nflog("bad iptip: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
+		// that they only work for certain protocols, hooks, tables.
+		// Get matchers.
+		matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry
+		if len(optVal) < int(matchersSize) {
+			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		matchers, err := parseMatchers(filter, optVal[:matchersSize])
+		if err != nil {
+			nflog("failed to parse matchers: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+		optVal = optVal[matchersSize:]
+
+		// Get the target of the rule.
+		targetSize := entry.NextOffset - entry.TargetOffset
+		if len(optVal) < int(targetSize) {
+			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		target, err := parseTarget(filter, optVal[:targetSize])
+		if err != nil {
+			nflog("failed to parse target: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+		optVal = optVal[targetSize:]
+
+		table.Rules = append(table.Rules, stack.Rule{
+			Filter:   filter,
+			Target:   target,
+			Matchers: matchers,
+		})
+		offsets[offset] = int(entryIdx)
+		offset += uint32(entry.NextOffset)
+
+		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+	}
+	return offsets, nil
+}
+
+func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) {
+	if containsUnsupportedFields6(iptip) {
+		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+	}
+	if len(iptip.Dst) != header.IPv6AddressSize || len(iptip.DstMask) != header.IPv6AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+	}
+	if len(iptip.Src) != header.IPv6AddressSize || len(iptip.SrcMask) != header.IPv6AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+	}
+
+	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterface)
+	}
+	ifname := string(iptip.OutputInterface[:n])
+
+	n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterfaceMask)
+	}
+	ifnameMask := string(iptip.OutputInterfaceMask[:n])
+
+	return stack.IPHeaderFilter{
+		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+		// In ip6tables a flag controls whether to check the protocol.
+		CheckProtocol:         iptip.Flags&linux.IP6T_F_PROTO != 0,
+		Dst:                   tcpip.Address(iptip.Dst[:]),
+		DstMask:               tcpip.Address(iptip.DstMask[:]),
+		DstInvert:             iptip.InverseFlags&linux.IP6T_INV_DSTIP != 0,
+		Src:                   tcpip.Address(iptip.Src[:]),
+		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
+		SrcInvert:             iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0,
+		OutputInterface:       ifname,
+		OutputInterfaceMask:   ifnameMask,
+		OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0,
+	}, nil
+}
+
+func containsUnsupportedFields6(iptip linux.IP6TIP) bool {
+	// The following features are supported:
+	// - Protocol
+	// - Dst and DstMask
+	// - Src and SrcMask
+	// - The inverse destination IP check flag
+	// - OutputInterface, OutputInterfaceMask and its inverse.
+	var emptyInterface = [linux.IFNAMSIZ]byte{}
+	flagMask := uint8(linux.IP6T_F_PROTO)
+	// Disable any supported inverse flags.
+	inverseMask := uint8(linux.IP6T_INV_DSTIP) | uint8(linux.IP6T_INV_SRCIP) | uint8(linux.IP6T_INV_VIA_OUT)
+	return iptip.InputInterface != emptyInterface ||
+		iptip.InputInterfaceMask != emptyInterface ||
+		iptip.Flags&^flagMask != 0 ||
+		iptip.InverseFlags&^inverseMask != 0 ||
+		iptip.TOS != 0
+}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index df256676f..3e1735079 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -42,14 +42,19 @@ func nflog(format string, args ...interface{}) {
 }
 
 // GetInfo returns information about iptables.
-func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
+func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) {
 	// Read in the struct and table name.
 	var info linux.IPTGetinfo
 	if _, err := info.CopyIn(t, outPtr); err != nil {
 		return linux.IPTGetinfo{}, syserr.FromError(err)
 	}
 
-	_, info, err := convertNetstackToBinary(stack, info.Name)
+	var err error
+	if ipv6 {
+		_, info, err = convertNetstackToBinary6(stack, info.Name)
+	} else {
+		_, info, err = convertNetstackToBinary4(stack, info.Name)
+	}
 	if err != nil {
 		nflog("couldn't convert iptables: %v", err)
 		return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
@@ -59,9 +64,9 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	return info, nil
 }
 
-// GetEntries4 returns netstack's iptables rules encoded for the iptables tool.
+// GetEntries4 returns netstack's iptables rules.
 func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
-	// Read in the ABI struct.
+	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := userEntries.CopyIn(t, outPtr); err != nil {
 		nflog("couldn't copy in entries %q", userEntries.Name)
@@ -70,7 +75,7 @@ func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
-	entries, _, err := convertNetstackToBinary(stack, userEntries.Name)
+	entries, _, err := convertNetstackToBinary4(stack, userEntries.Name)
 	if err != nil {
 		nflog("couldn't read entries: %v", err)
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -83,28 +88,29 @@ func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	return entries, nil
 }
 
-// convertNetstackToBinary converts the iptables as stored in netstack to the
-// format expected by the iptables tool. Linux stores each table as a binary
-// blob that can only be traversed by parsing a bit, reading some offsets,
-// jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
-	// The table name has to fit in the struct.
-	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
-		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+// GetEntries6 returns netstack's ip6tables rules.
+func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIP6TGetEntries, *syserr.Error) {
+	// Read in the struct and table name. IPv4 and IPv6 utilize structs
+	// with the same layout.
+	var userEntries linux.IPTGetEntries
+	if _, err := userEntries.CopyIn(t, outPtr); err != nil {
+		nflog("couldn't copy in entries %q", userEntries.Name)
+		return linux.KernelIP6TGetEntries{}, syserr.FromError(err)
 	}
 
-	table, ok := stk.IPTables().GetTable(tablename.String())
-	if !ok {
-		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+	// Convert netstack's iptables rules to something that the iptables
+	// tool can understand.
+	entries, _, err := convertNetstackToBinary6(stack, userEntries.Name)
+	if err != nil {
+		nflog("couldn't read entries: %v", err)
+		return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
+	}
+	if binary.Size(entries) > uintptr(outLen) {
+		nflog("insufficient GetEntries output size: %d", uintptr(outLen))
+		return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
 	}
 
-	// Setup the info struct.
-	var info linux.IPTGetinfo
-	info.ValidHooks = table.ValidHooks()
-	copy(info.Name[:], tablename[:])
-
-	entries := getEntries4(table, &info)
-	return entries, info, nil
+	return entries, nil
 }
 
 // setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint
@@ -128,7 +134,7 @@ func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint
 
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
-func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
+func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
 	var replace linux.IPTReplace
 	replaceBuf := optVal[:linux.SizeOfIPTReplace]
 	optVal = optVal[linux.SizeOfIPTReplace:]
@@ -146,7 +152,13 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
-	offsets, err := modifyEntries4(stk, optVal, &replace, &table)
+	var err *syserr.Error
+	var offsets map[uint32]int
+	if ipv6 {
+		offsets, err = modifyEntries6(stk, optVal, &replace, &table)
+	} else {
+		offsets, err = modifyEntries4(stk, optVal, &replace, &table)
+	}
 	if err != nil {
 		return err
 	}
@@ -163,7 +175,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
-					if !validUnderflow(table.Rules[ruleIdx]) {
+					if !validUnderflow(table.Rules[ruleIdx], ipv6) {
 						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx)
 						return syserr.ErrInvalidArgument
 					}
@@ -228,7 +240,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 			if ruleIdx == stack.HookUnset {
 				continue
 			}
-			if !isUnconditionalAccept(table.Rules[ruleIdx]) {
+			if !isUnconditionalAccept(table.Rules[ruleIdx], ipv6) {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -240,7 +252,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// - There are no chains without an unconditional final rule.
 	// - There are no chains without an unconditional underflow rule.
 
-	return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table))
+	return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table, ipv6))
+
 }
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
@@ -286,11 +299,11 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
 	return matchers, nil
 }
 
-func validUnderflow(rule stack.Rule) bool {
+func validUnderflow(rule stack.Rule, ipv6 bool) bool {
 	if len(rule.Matchers) != 0 {
 		return false
 	}
-	if rule.Filter != emptyIPv4Filter {
+	if (ipv6 && rule.Filter != emptyIPv6Filter) || (!ipv6 && rule.Filter != emptyIPv4Filter) {
 		return false
 	}
 	switch rule.Target.(type) {
@@ -301,8 +314,8 @@ func validUnderflow(rule stack.Rule) bool {
 	}
 }
 
-func isUnconditionalAccept(rule stack.Rule) bool {
-	if !validUnderflow(rule) {
+func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool {
+	if !validUnderflow(rule, ipv6) {
 		return false
 	}
 	_, ok := rule.Target.(stack.AcceptTarget)
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 8ebdaff18..87e41abd8 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -218,8 +218,8 @@ func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, erro
 			return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
 		}
 
-		if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+		if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
+			return nil, fmt.Errorf("netfilter.SetEntries: bad proto %d", p)
 		}
 
 		var redirectTarget linux.XTRedirectTarget
@@ -232,7 +232,7 @@ func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, erro
 
 		// RangeSize should be 1.
 		if nfRange.RangeSize != 1 {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+			return nil, fmt.Errorf("netfilter.SetEntries: bad rangesize %d", nfRange.RangeSize)
 		}
 
 		// TODO(gvisor.dev/issue/170): Check if the flags are valid.
@@ -240,7 +240,7 @@ func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, erro
 		// For now, redirect target only supports destination port change.
 		// Port range and IP range are not supported yet.
 		if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid range flags %d", nfRange.RangeIPV4.Flags)
 		}
 		target.RangeProtoSpecified = true
 
@@ -249,7 +249,7 @@ func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, erro
 
 		// TODO(gvisor.dev/issue/170): Port range is not supported yet.
 		if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+			return nil, fmt.Errorf("netfilter.SetEntries: minport != maxport (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
 		}
 
 		// Convert port from big endian to little endian.
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9e2ebc7d4..2af2d8252 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -997,7 +997,7 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in
 		return getSockOptTCP(t, ep, name, outLen)
 
 	case linux.SOL_IPV6:
-		return getSockOptIPv6(t, ep, name, outLen)
+		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
 
 	case linux.SOL_IP:
 		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
@@ -1455,7 +1455,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal
 }
 
 // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
-func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
+func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch name {
 	case linux.IPV6_V6ONLY:
 		if outLen < sizeOfInt32 {
@@ -1508,10 +1508,50 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha
 		vP := primitive.Int32(boolToInt32(v))
 		return &vP, nil
 
-	case linux.SO_ORIGINAL_DST:
+	case linux.IP6T_ORIGINAL_DST:
 		// TODO(gvisor.dev/issue/170): ip6tables.
 		return nil, syserr.ErrInvalidArgument
 
+	case linux.IP6T_SO_GET_INFO:
+		if outLen < linux.SizeOfIPTGetinfo {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv6 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true)
+		if err != nil {
+			return nil, err
+		}
+		return &info, nil
+
+	case linux.IP6T_SO_GET_ENTRIES:
+		// IPTGetEntries is reused for IPv6.
+		if outLen < linux.SizeOfIPTGetEntries {
+			return nil, syserr.ErrInvalidArgument
+		}
+		// Only valid for raw IPv6 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen)
+		if err != nil {
+			return nil, err
+		}
+		return &entries, nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -1649,7 +1689,7 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 		if stack == nil {
 			return nil, syserr.ErrNoDevice
 		}
-		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
+		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false)
 		if err != nil {
 			return nil, err
 		}
@@ -1722,7 +1762,7 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
 		return setSockOptTCP(t, ep, name, optVal)
 
 	case linux.SOL_IPV6:
-		return setSockOptIPv6(t, ep, name, optVal)
+		return setSockOptIPv6(t, s, ep, name, optVal)
 
 	case linux.SOL_IP:
 		return setSockOptIP(t, s, ep, name, optVal)
@@ -2027,7 +2067,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 }
 
 // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
-func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
 	switch name {
 	case linux.IPV6_V6ONLY:
 		if len(optVal) < sizeOfInt32 {
@@ -2076,6 +2116,27 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
 
+	case linux.IP6T_SO_SET_REPLACE:
+		if len(optVal) < linux.SizeOfIP6TReplace {
+			return syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv6 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+			return syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return syserr.ErrNoDevice
+		}
+		// Stack must be a netstack stack.
+		return netfilter.SetEntries(stack.(*Stack).Stack, optVal, true)
+
+	case linux.IP6T_SO_SET_ADD_COUNTERS:
+		// TODO(gvisor.dev/issue/170): Counter support.
+		return nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -2271,7 +2332,7 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return syserr.ErrNoDevice
 		}
 		// Stack must be a netstack stack.
-		return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
+		return netfilter.SetEntries(stack.(*Stack).Stack, optVal, false)
 
 	case linux.IPT_SO_SET_ADD_COUNTERS:
 		// TODO(gvisor.dev/issue/170): Counter support.
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 41ef4236b..30aa41db2 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -165,7 +165,11 @@ func EmptyNATTable() Table {
 }
 
 // GetTable returns a table by name.
-func (it *IPTables) GetTable(name string) (Table, bool) {
+func (it *IPTables) GetTable(name string, ipv6 bool) (Table, bool) {
+	// TODO(gvisor.dev/issue/3549): Enable IPv6.
+	if ipv6 {
+		return Table{}, false
+	}
 	id, ok := nameToID[name]
 	if !ok {
 		return Table{}, false
@@ -176,7 +180,11 @@ func (it *IPTables) GetTable(name string) (Table, bool) {
 }
 
 // ReplaceTable replaces or inserts table by name.
-func (it *IPTables) ReplaceTable(name string, table Table) *tcpip.Error {
+func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Error {
+	// TODO(gvisor.dev/issue/3549): Enable IPv6.
+	if ipv6 {
+		return tcpip.ErrInvalidOptionValue
+	}
 	id, ok := nameToID[name]
 	if !ok {
 		return tcpip.ErrInvalidOptionValue
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
index 73274ada9..fbbd2f50f 100644
--- a/pkg/tcpip/stack/iptables_types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -155,6 +155,11 @@ type IPHeaderFilter struct {
 	// Protocol matches the transport protocol.
 	Protocol tcpip.TransportProtocolNumber
 
+	// CheckProtocol determines whether the Protocol field should be
+	// checked during matching.
+	// TODO(gvisor.dev/issue/3549): Check this field during matching.
+	CheckProtocol bool
+
 	// Dst matches the destination IP address.
 	Dst tcpip.Address
 
diff --git a/test/syscalls/linux/ip6tables.cc b/test/syscalls/linux/ip6tables.cc
index 685e513f8..78e1fa09d 100644
--- a/test/syscalls/linux/ip6tables.cc
+++ b/test/syscalls/linux/ip6tables.cc
@@ -34,6 +34,54 @@ constexpr size_t kEmptyStandardEntrySize =
 constexpr size_t kEmptyErrorEntrySize =
     sizeof(struct ip6t_entry) + sizeof(struct xt_error_target);
 
+TEST(IP6TablesBasic, FailSockoptNonRaw) {
+  // Even if the user has CAP_NET_RAW, they shouldn't be able to use the
+  // ip6tables sockopts with a non-raw socket.
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET6, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info);
+  EXPECT_THAT(getsockopt(sock, SOL_IPV6, IP6T_SO_GET_INFO, &info, &info_size),
+              SyscallFailsWithErrno(ENOPROTOOPT));
+
+  EXPECT_THAT(close(sock), SyscallSucceeds());
+}
+
+TEST(IP6TablesBasic, GetInfoErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET6, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info) - 1;
+  EXPECT_THAT(getsockopt(sock, SOL_IPV6, IP6T_SO_GET_INFO, &info, &info_size),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(IP6TablesBasic, GetEntriesErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET6, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ip6t_get_entries entries = {};
+  socklen_t entries_size = sizeof(struct ip6t_get_entries) - 1;
+  snprintf(entries.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  EXPECT_THAT(
+      getsockopt(sock, SOL_IPV6, IP6T_SO_GET_ENTRIES, &entries, &entries_size),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 // This tests the initial state of a machine with empty ip6tables via
 // getsockopt(IP6T_SO_GET_INFO). We don't have a guarantee that the iptables are
 // empty when running in native, but we can test that gVisor has the same
-- 
cgit v1.2.3


From 5d449c870622f7088825af5650786e8bb755567a Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 27 Aug 2020 12:48:19 -0700
Subject: Add function to get error from a tcpip.Endpoint

In an upcoming CL, socket option types are made to implement a marker
interface with pointer receivers. Since this results in calling methods
of an interface with a pointer, we incur an allocation when attempting
to get an Endpoint's last error with the current implementation.

When calling the method of an interface, the compiler is unable to
determine what the interface implementation does with the pointer
(since calling a method on an interface uses virtual dispatch at runtime
so the compiler does not know what the interface method will do) so it
allocates on the heap to be safe incase an implementation continues to
hold the pointer after the functioon returns (the reference escapes the
scope of the object).

In the example below, the compiler does not know what b.foo does with
the reference to a it allocates a on the heap as the reference to a may
escape the scope of a.
```
var a int
var b someInterface
b.foo(&a)
```

This change removes the opportunity for that allocation.

RELNOTES: n/a
PiperOrigin-RevId: 328796559
---
 pkg/sentry/socket/netstack/netstack.go             |  5 ++++-
 pkg/sentry/socket/unix/transport/unix.go           | 10 ++++++++-
 pkg/tcpip/adapters/gonet/gonet.go                  |  2 +-
 pkg/tcpip/adapters/gonet/gonet_test.go             |  2 +-
 pkg/tcpip/sample/tun_tcp_connect/main.go           |  2 +-
 pkg/tcpip/stack/transport_test.go                  | 26 +++++++++-------------
 pkg/tcpip/tcpip.go                                 |  7 +++---
 pkg/tcpip/transport/icmp/endpoint.go               | 15 ++++++-------
 pkg/tcpip/transport/packet/endpoint.go             |  8 ++-----
 pkg/tcpip/transport/raw/endpoint.go                | 14 +++++-------
 pkg/tcpip/transport/tcp/connect.go                 |  4 ++--
 pkg/tcpip/transport/tcp/dual_stack_test.go         |  6 ++---
 pkg/tcpip/transport/tcp/endpoint.go                |  5 +----
 pkg/tcpip/transport/tcp/tcp_test.go                | 10 ++++-----
 pkg/tcpip/transport/tcp/testing/context/context.go |  5 ++---
 pkg/tcpip/transport/udp/endpoint.go                |  8 +++----
 16 files changed, 60 insertions(+), 69 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 2af2d8252..8da77cc68 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -257,6 +257,9 @@ type commonEndpoint interface {
 	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
 	// transport.Endpoint.GetSockOpt.
 	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
+
+	// LastError implements tcpip.Endpoint.LastError.
+	LastError() *tcpip.Error
 }
 
 // LINT.IfChange
@@ -1030,7 +1033,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		}
 
 		// Get the last error and convert it.
-		err := ep.GetSockOpt(tcpip.ErrorOption{})
+		err := ep.LastError()
 		if err == nil {
 			optP := primitive.Int32(0)
 			return &optP, nil
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index ab7bab5cd..4bf06d4dc 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -199,6 +199,9 @@ type Endpoint interface {
 	// State returns the current state of the socket, as represented by Linux in
 	// procfs.
 	State() uint32
+
+	// LastError implements tcpip.Endpoint.LastError.
+	LastError() *tcpip.Error
 }
 
 // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
@@ -942,7 +945,7 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	switch opt.(type) {
-	case tcpip.ErrorOption, *tcpip.LingerOption:
+	case *tcpip.LingerOption:
 		return nil
 
 	default:
@@ -951,6 +954,11 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	}
 }
 
+// LastError implements Endpoint.LastError.
+func (*baseEndpoint) LastError() *tcpip.Error {
+	return nil
+}
+
 // Shutdown closes the read and/or write end of the endpoint connection to its
 // peer.
 func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error {
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index d82ed5205..68a954a10 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -541,7 +541,7 @@ func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress,
 		case <-notifyCh:
 		}
 
-		err = ep.GetSockOpt(tcpip.ErrorOption{})
+		err = ep.LastError()
 	}
 	if err != nil {
 		ep.Close()
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index 3c552988a..c975ad9cf 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -104,7 +104,7 @@ func connect(s *stack.Stack, addr tcpip.FullAddress) (*testConnection, *tcpip.Er
 	err = ep.Connect(addr)
 	if err == tcpip.ErrConnectStarted {
 		<-ch
-		err = ep.GetSockOpt(tcpip.ErrorOption{})
+		err = ep.LastError()
 	}
 	if err != nil {
 		return nil, err
diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go
index 0ab089208..91fc26722 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/main.go
+++ b/pkg/tcpip/sample/tun_tcp_connect/main.go
@@ -182,7 +182,7 @@ func main() {
 	if terr == tcpip.ErrConnectStarted {
 		fmt.Println("Connect is pending...")
 		<-notifyCh
-		terr = ep.GetSockOpt(tcpip.ErrorOption{})
+		terr = ep.LastError()
 	}
 	wq.EventUnregister(&waitEntry)
 
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 6c6e44468..7869bb98b 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -53,11 +53,11 @@ func (f *fakeTransportEndpoint) Info() tcpip.EndpointInfo {
 	return &f.TransportEndpointInfo
 }
 
-func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats {
+func (*fakeTransportEndpoint) Stats() tcpip.EndpointStats {
 	return nil
 }
 
-func (f *fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {}
+func (*fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {}
 
 func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
 	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
@@ -100,7 +100,7 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions
 	return int64(len(v)), nil, nil
 }
 
-func (f *fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+func (*fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 	return 0, tcpip.ControlMessages{}, nil
 }
 
@@ -131,10 +131,6 @@ func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.E
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (*fakeTransportEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
-		return nil
-	}
 	return tcpip.ErrInvalidEndpointState
 }
 
@@ -169,7 +165,7 @@ func (f *fakeTransportEndpoint) UniqueID() uint64 {
 	return f.uniqueID
 }
 
-func (f *fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error {
+func (*fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error {
 	return nil
 }
 
@@ -239,19 +235,19 @@ func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, s
 	f.proto.controlCount++
 }
 
-func (f *fakeTransportEndpoint) State() uint32 {
+func (*fakeTransportEndpoint) State() uint32 {
 	return 0
 }
 
-func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
+func (*fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
 
-func (f *fakeTransportEndpoint) IPTables() (stack.IPTables, error) {
-	return stack.IPTables{}, nil
-}
+func (*fakeTransportEndpoint) Resume(*stack.Stack) {}
 
-func (f *fakeTransportEndpoint) Resume(*stack.Stack) {}
+func (*fakeTransportEndpoint) Wait() {}
 
-func (f *fakeTransportEndpoint) Wait() {}
+func (*fakeTransportEndpoint) LastError() *tcpip.Error {
+	return nil
+}
 
 type fakeTransportGoodOption bool
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 609b8af33..cae943608 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -620,6 +620,9 @@ type Endpoint interface {
 
 	// SetOwner sets the task owner to the endpoint owner.
 	SetOwner(owner PacketOwner)
+
+	// LastError clears and returns the last error reported by the endpoint.
+	LastError() *Error
 }
 
 // LinkPacketInfo holds Link layer information for a received packet.
@@ -839,10 +842,6 @@ const (
 	PMTUDiscoveryProbe
 )
 
-// ErrorOption is used in GetSockOpt to specify that the last error reported by
-// the endpoint should be cleared and returned.
-type ErrorOption struct{}
-
 // BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
 // should bind only on a specific NIC.
 type BindToDeviceOption NICID
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index bd6f49eb8..c545c8367 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -415,14 +415,8 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
-		return nil
-
-	default:
-		return tcpip.ErrUnknownProtocolOption
-	}
+func (*endpoint) GetSockOpt(interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
 }
 
 func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error {
@@ -836,3 +830,8 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 
 // Wait implements stack.TransportEndpoint.Wait.
 func (*endpoint) Wait() {}
+
+// LastError implements tcpip.Endpoint.LastError.
+func (*endpoint) LastError() *tcpip.Error {
+	return nil
+}
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 1b03ad6bb..95dc8ed57 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -356,7 +356,7 @@ func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	}
 }
 
-func (ep *endpoint) takeLastError() *tcpip.Error {
+func (ep *endpoint) LastError() *tcpip.Error {
 	ep.lastErrorMu.Lock()
 	defer ep.lastErrorMu.Unlock()
 
@@ -366,11 +366,7 @@ func (ep *endpoint) takeLastError() *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
-		return ep.takeLastError()
-	}
+func (*endpoint) GetSockOpt(interface{}) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index edc2b5b61..2087bcfa8 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -577,14 +577,8 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
-		return nil
-
-	default:
-		return tcpip.ErrUnknownProtocolOption
-	}
+func (*endpoint) GetSockOpt(interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
@@ -739,3 +733,7 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 
 // Wait implements stack.TransportEndpoint.Wait.
 func (*endpoint) Wait() {}
+
+func (*endpoint) LastError() *tcpip.Error {
+	return nil
+}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 290172ac9..72df5c2a1 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -491,7 +491,7 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.takeLastError()
+				return h.ep.LastError()
 			}
 		}
 
@@ -620,7 +620,7 @@ func (h *handshake) execute() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.takeLastError()
+				return h.ep.LastError()
 			}
 
 		case wakerForNewSegment:
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 804e95aea..6074cc24e 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -86,8 +86,7 @@ func testV4Connect(t *testing.T, c *context.Context, checkers ...checker.Network
 	// Wait for connection to be established.
 	select {
 	case <-ch:
-		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
-		if err != nil {
+		if err := c.EP.LastError(); err != nil {
 			t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
@@ -194,8 +193,7 @@ func testV6Connect(t *testing.T, c *context.Context, checkers ...checker.Network
 	// Wait for connection to be established.
 	select {
 	case <-ch:
-		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
-		if err != nil {
+		if err := c.EP.LastError(); err != nil {
 			t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index ff9b8804d..8a5e993b5 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1234,7 +1234,7 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
-func (e *endpoint) takeLastError() *tcpip.Error {
+func (e *endpoint) LastError() *tcpip.Error {
 	e.lastErrorMu.Lock()
 	defer e.lastErrorMu.Unlock()
 	err := e.lastError
@@ -1995,9 +1995,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	switch o := opt.(type) {
-	case tcpip.ErrorOption:
-		return e.takeLastError()
-
 	case *tcpip.BindToDeviceOption:
 		e.LockUser()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 9650bb06c..3d3034d50 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -74,8 +74,8 @@ func TestGiveUpConnect(t *testing.T) {
 
 	// Wait for ep to become writable.
 	<-notifyCh
-	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted {
-		t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %s, want = %s", err, tcpip.ErrAborted)
+	if err := ep.LastError(); err != tcpip.ErrAborted {
+		t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted)
 	}
 
 	// Call Connect again to retreive the handshake failure status
@@ -3023,8 +3023,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
 	// Wait for connection to be established.
 	select {
 	case <-ch:
-		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
-			t.Fatalf("GetSockOpt failed: %s", err)
+		if err := c.EP.LastError(); err != nil {
+			t.Fatalf("Connect failed: %s", err)
 		}
 	case <-time.After(1 * time.Second):
 		t.Fatalf("Timed out waiting for connection")
@@ -4411,7 +4411,7 @@ func TestSelfConnect(t *testing.T) {
 	}
 
 	<-notifyCh
-	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+	if err := ep.LastError(); err != nil {
 		t.Fatalf("Connect failed: %s", err)
 	}
 
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index b6031354e..1f5340cd0 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -638,7 +638,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
 	// Wait for connection to be established.
 	select {
 	case <-notifyCh:
-		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+		if err := c.EP.LastError(); err != nil {
 			c.t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
@@ -882,8 +882,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
 	// Wait for connection to be established.
 	select {
 	case <-notifyCh:
-		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
-		if err != nil {
+		if err := c.EP.LastError(); err != nil {
 			c.t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 0a9d3c6cf..1d5ebe3f2 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -209,7 +209,7 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
-func (e *endpoint) takeLastError() *tcpip.Error {
+func (e *endpoint) LastError() *tcpip.Error {
 	e.lastErrorMu.Lock()
 	defer e.lastErrorMu.Unlock()
 
@@ -268,7 +268,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {}
 // Read reads data from the endpoint. This method does not block if
 // there is no data pending.
 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
-	if err := e.takeLastError(); err != nil {
+	if err := e.LastError(); err != nil {
 		return buffer.View{}, tcpip.ControlMessages{}, err
 	}
 
@@ -411,7 +411,7 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 }
 
 func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
-	if err := e.takeLastError(); err != nil {
+	if err := e.LastError(); err != nil {
 		return 0, nil, err
 	}
 
@@ -962,8 +962,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	switch o := opt.(type) {
-	case tcpip.ErrorOption:
-		return e.takeLastError()
 	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		*o = tcpip.MulticastInterfaceOption{
-- 
cgit v1.2.3


From cc5312a42f21f34c178cd821de227f4167c00cfb Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Thu, 27 Aug 2020 15:45:02 -0700
Subject: Improve type safety for socket options

The existing implementation for {G,S}etSockOpt take arguments of an
empty interface type which all types (implicitly) implement; any
type may be passed to the functions.

This change introduces marker interfaces for socket options that may be
set or queried which socket option types implement to ensure that invalid
types are caught at compile time. Different interfaces are used to allow
the compiler to enforce read-only or set-only socket options.

Fixes #3714.

RELNOTES: n/a
PiperOrigin-RevId: 328832161
---
 pkg/sentry/socket/netstack/netstack.go    |  55 +++++++------
 pkg/sentry/socket/unix/transport/unix.go  |  14 ++--
 pkg/tcpip/stack/transport_demuxer_test.go |   4 +-
 pkg/tcpip/stack/transport_test.go         |   4 +-
 pkg/tcpip/tcpip.go                        | 127 +++++++++++++++++++++++++-----
 pkg/tcpip/transport/icmp/endpoint.go      |   6 +-
 pkg/tcpip/transport/packet/endpoint.go    |   6 +-
 pkg/tcpip/transport/raw/endpoint.go       |   6 +-
 pkg/tcpip/transport/tcp/endpoint.go       |  58 +++++++-------
 pkg/tcpip/transport/tcp/tcp_test.go       |  95 ++++++++++++++--------
 pkg/tcpip/transport/udp/endpoint.go       |  16 ++--
 pkg/tcpip/transport/udp/udp_test.go       |  45 +++++------
 12 files changed, 276 insertions(+), 160 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8da77cc68..0bf21f7d8 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -236,7 +236,7 @@ type commonEndpoint interface {
 
 	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
 	// transport.Endpoint.SetSockOpt.
-	SetSockOpt(interface{}) *tcpip.Error
+	SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error
 
 	// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and
 	// transport.Endpoint.SetSockOptBool.
@@ -248,7 +248,7 @@ type commonEndpoint interface {
 
 	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
 	// transport.Endpoint.GetSockOpt.
-	GetSockOpt(interface{}) *tcpip.Error
+	GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error
 
 	// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and
 	// transport.Endpoint.GetSockOpt.
@@ -1778,8 +1778,7 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
 		t.Kernel().EmitUnimplementedEvent(t)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
@@ -1824,7 +1823,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 		name := string(optVal[:n])
 		if name == "" {
-			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0)))
+			v := tcpip.BindToDeviceOption(0)
+			return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
 		}
 		s := t.NetworkContext()
 		if s == nil {
@@ -1832,7 +1832,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 		for nicID, nic := range s.Interfaces() {
 			if nic.Name == name {
-				return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID)))
+				v := tcpip.BindToDeviceOption(nicID)
+				return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
 			}
 		}
 		return syserr.ErrUnknownDevice
@@ -1898,7 +1899,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v)))
+		opt := tcpip.OutOfBandInlineOption(v)
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.SO_NO_CHECK:
 		if len(optVal) < sizeOfInt32 {
@@ -1921,21 +1923,20 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		return syserr.TranslateNetstackError(
-			ep.SetSockOpt(tcpip.LingerOption{
+			ep.SetSockOpt(&tcpip.LingerOption{
 				Enabled: v.OnOff != 0,
 				Timeout: time.Second * time.Duration(v.Linger)}))
 
 	case linux.SO_DETACH_FILTER:
 		// optval is ignored.
 		var v tcpip.SocketDetachFilterOption
-		return syserr.TranslateNetstackError(ep.SetSockOpt(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
 
 	default:
 		socket.SetSockOptEmitUnimplementedEvent(t, name)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
@@ -1982,7 +1983,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))))
+		opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_KEEPINTVL:
 		if len(optVal) < sizeOfInt32 {
@@ -1993,7 +1995,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
+		opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_KEEPCNT:
 		if len(optVal) < sizeOfInt32 {
@@ -2015,11 +2018,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 0 {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))))
+		opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_CONGESTION:
 		v := tcpip.CongestionControlOption(optVal)
-		if err := ep.SetSockOpt(v); err != nil {
+		if err := ep.SetSockOpt(&v); err != nil {
 			return syserr.TranslateNetstackError(err)
 		}
 		return nil
@@ -2030,7 +2034,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := int32(usermem.ByteOrder.Uint32(optVal))
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
+		opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_DEFER_ACCEPT:
 		if len(optVal) < sizeOfInt32 {
@@ -2040,7 +2045,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 0 {
 			v = 0
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+		opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_SYNCNT:
 		if len(optVal) < sizeOfInt32 {
@@ -2065,8 +2071,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		emitUnimplementedEventTCP(t, name)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
@@ -2144,8 +2149,7 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 		emitUnimplementedEventIPv6(t, name)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 var (
@@ -2223,7 +2227,7 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
 			NIC: tcpip.NICID(req.InterfaceIndex),
 			// TODO(igudger): Change AddMembership to use the standard
 			// any address representation.
@@ -2237,7 +2241,7 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
 			NIC: tcpip.NICID(req.InterfaceIndex),
 			// TODO(igudger): Change DropMembership to use the standard
 			// any address representation.
@@ -2251,7 +2255,7 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
 			NIC:           tcpip.NICID(req.InterfaceIndex),
 			InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]),
 		}))
@@ -2375,8 +2379,7 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 		t.Kernel().EmitUnimplementedEvent(t)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // emitUnimplementedEventTCP emits unimplemented event if name is valid. This
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 4bf06d4dc..cc9d650fb 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -172,9 +172,8 @@ type Endpoint interface {
 	// connected.
 	GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error)
 
-	// SetSockOpt sets a socket option. opt should be one of the tcpip.*Option
-	// types.
-	SetSockOpt(opt interface{}) *tcpip.Error
+	// SetSockOpt sets a socket option.
+	SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error
 
 	// SetSockOptBool sets a socket option for simple cases when a value has
 	// the int type.
@@ -184,9 +183,8 @@ type Endpoint interface {
 	// the int type.
 	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
 
-	// GetSockOpt gets a socket option. opt should be a pointer to one of the
-	// tcpip.*Option types.
-	GetSockOpt(opt interface{}) *tcpip.Error
+	// GetSockOpt gets a socket option.
+	GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error
 
 	// GetSockOptBool gets a socket option for simple cases when a return
 	// value has the int type.
@@ -841,7 +839,7 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
 }
 
 // SetSockOpt sets a socket option. Currently not supported.
-func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (e *baseEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error {
 	return nil
 }
 
@@ -943,7 +941,7 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 	switch opt.(type) {
 	case *tcpip.LingerOption:
 		return nil
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 1339edc2d..4d6d62eec 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -312,8 +312,8 @@ func TestBindToDeviceDistribution(t *testing.T) {
 							t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err)
 						}
 						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
-						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
-							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err)
+						if err := ep.SetSockOpt(&bindToDeviceOption); err != nil {
+							t.Fatalf("SetSockOpt(&%T(%d)) on endpoint %d failed: %s", bindToDeviceOption, bindToDeviceOption, i, err)
 						}
 
 						var dstAddr tcpip.Address
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 7869bb98b..a1458c899 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -105,7 +105,7 @@ func (*fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcp
 }
 
 // SetSockOpt sets a socket option. Currently not supported.
-func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error {
+func (*fakeTransportEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
@@ -130,7 +130,7 @@ func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.E
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*fakeTransportEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+func (*fakeTransportEndpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index cae943608..cd72d4f02 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -578,8 +578,8 @@ type Endpoint interface {
 	// if waiter.EventIn is set, the endpoint is immediately readable.
 	Readiness(mask waiter.EventMask) waiter.EventMask
 
-	// SetSockOpt sets a socket option. opt should be one of the *Option types.
-	SetSockOpt(opt interface{}) *Error
+	// SetSockOpt sets a socket option.
+	SetSockOpt(opt SettableSocketOption) *Error
 
 	// SetSockOptBool sets a socket option, for simple cases where a value
 	// has the bool type.
@@ -589,9 +589,8 @@ type Endpoint interface {
 	// has the int type.
 	SetSockOptInt(opt SockOptInt, v int) *Error
 
-	// GetSockOpt gets a socket option. opt should be a pointer to one of the
-	// *Option types.
-	GetSockOpt(opt interface{}) *Error
+	// GetSockOpt gets a socket option.
+	GetSockOpt(opt GettableSocketOption) *Error
 
 	// GetSockOptBool gets a socket option for simple cases where a return
 	// value has the bool type.
@@ -842,10 +841,37 @@ const (
 	PMTUDiscoveryProbe
 )
 
+// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
+// a default TTL.
+type DefaultTTLOption uint8
+
+// AvailableCongestionControlOption is used to query the supported congestion
+// control algorithms.
+type AvailableCongestionControlOption string
+
+// ModerateReceiveBufferOption is used by buffer moderation.
+type ModerateReceiveBufferOption bool
+
+// GettableSocketOption is a marker interface for socket options that may be
+// queried.
+type GettableSocketOption interface {
+	isGettableSocketOption()
+}
+
+// SettableSocketOption is a marker interface for socket options that may be
+// configured.
+type SettableSocketOption interface {
+	isSettableSocketOption()
+}
+
 // BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
 // should bind only on a specific NIC.
 type BindToDeviceOption NICID
 
+func (*BindToDeviceOption) isGettableSocketOption() {}
+
+func (*BindToDeviceOption) isSettableSocketOption() {}
+
 // TCPInfoOption is used by GetSockOpt to expose TCP statistics.
 //
 // TODO(b/64800844): Add and populate stat fields.
@@ -854,68 +880,111 @@ type TCPInfoOption struct {
 	RTTVar time.Duration
 }
 
+func (*TCPInfoOption) isGettableSocketOption() {}
+
 // KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
 // connection must remain idle before the first TCP keepalive packet is sent.
 // Once this time is reached, KeepaliveIntervalOption is used instead.
 type KeepaliveIdleOption time.Duration
 
+func (*KeepaliveIdleOption) isGettableSocketOption() {}
+
+func (*KeepaliveIdleOption) isSettableSocketOption() {}
+
 // KeepaliveIntervalOption is used by SetSockOpt/GetSockOpt to specify the
 // interval between sending TCP keepalive packets.
 type KeepaliveIntervalOption time.Duration
 
+func (*KeepaliveIntervalOption) isGettableSocketOption() {}
+
+func (*KeepaliveIntervalOption) isSettableSocketOption() {}
+
 // TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user
 // specified timeout for a given TCP connection.
 // See: RFC5482 for details.
 type TCPUserTimeoutOption time.Duration
 
+func (*TCPUserTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPUserTimeoutOption) isSettableSocketOption() {}
+
 // CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
 // the current congestion control algorithm.
 type CongestionControlOption string
 
-// AvailableCongestionControlOption is used to query the supported congestion
-// control algorithms.
-type AvailableCongestionControlOption string
+func (*CongestionControlOption) isGettableSocketOption() {}
 
-// ModerateReceiveBufferOption is used by buffer moderation.
-type ModerateReceiveBufferOption bool
+func (*CongestionControlOption) isSettableSocketOption() {}
 
 // TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
 // before being marked closed.
 type TCPLingerTimeoutOption time.Duration
 
+func (*TCPLingerTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPLingerTimeoutOption) isSettableSocketOption() {}
+
 // TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TIME_WAIT state
 // before being marked closed.
 type TCPTimeWaitTimeoutOption time.Duration
 
+func (*TCPTimeWaitTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isSettableSocketOption() {}
+
 // TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow a
 // accept to return a completed connection only when there is data to be
 // read. This usually means the listening socket will drop the final ACK
 // for a handshake till the specified timeout until a segment with data arrives.
 type TCPDeferAcceptOption time.Duration
 
+func (*TCPDeferAcceptOption) isGettableSocketOption() {}
+
+func (*TCPDeferAcceptOption) isSettableSocketOption() {}
+
 // TCPMinRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
 // default MinRTO used by the Stack.
 type TCPMinRTOOption time.Duration
 
+func (*TCPMinRTOOption) isGettableSocketOption() {}
+
+func (*TCPMinRTOOption) isSettableSocketOption() {}
+
 // TCPMaxRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
 // default MaxRTO used by the Stack.
 type TCPMaxRTOOption time.Duration
 
+func (*TCPMaxRTOOption) isGettableSocketOption() {}
+
+func (*TCPMaxRTOOption) isSettableSocketOption() {}
+
 // TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum number of retransmits after which we time out the connection.
 type TCPMaxRetriesOption uint64
 
+func (*TCPMaxRetriesOption) isGettableSocketOption() {}
+
+func (*TCPMaxRetriesOption) isSettableSocketOption() {}
+
 // TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
 // the number of endpoints that can be in SYN-RCVD state before the stack
 // switches to using SYN cookies.
 type TCPSynRcvdCountThresholdOption uint64
 
+func (*TCPSynRcvdCountThresholdOption) isGettableSocketOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isSettableSocketOption() {}
+
 // TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
 // default for number of times SYN is retransmitted before aborting a connect.
 type TCPSynRetriesOption uint8
 
+func (*TCPSynRetriesOption) isGettableSocketOption() {}
+
+func (*TCPSynRetriesOption) isSettableSocketOption() {}
+
 // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
 // default interface for multicast.
 type MulticastInterfaceOption struct {
@@ -923,45 +992,57 @@ type MulticastInterfaceOption struct {
 	InterfaceAddr Address
 }
 
-// MembershipOption is used by SetSockOpt/GetSockOpt as an argument to
-// AddMembershipOption and RemoveMembershipOption.
+func (*MulticastInterfaceOption) isGettableSocketOption() {}
+
+func (*MulticastInterfaceOption) isSettableSocketOption() {}
+
+// MembershipOption is used to identify a multicast membership on an interface.
 type MembershipOption struct {
 	NIC           NICID
 	InterfaceAddr Address
 	MulticastAddr Address
 }
 
-// AddMembershipOption is used by SetSockOpt/GetSockOpt to join a multicast
-// group identified by the given multicast address, on the interface matching
-// the given interface address.
+// AddMembershipOption identifies a multicast group to join on some interface.
 type AddMembershipOption MembershipOption
 
-// RemoveMembershipOption is used by SetSockOpt/GetSockOpt to leave a multicast
-// group identified by the given multicast address, on the interface matching
-// the given interface address.
+func (*AddMembershipOption) isSettableSocketOption() {}
+
+// RemoveMembershipOption identifies a multicast group to leave on some
+// interface.
 type RemoveMembershipOption MembershipOption
 
+func (*RemoveMembershipOption) isSettableSocketOption() {}
+
 // OutOfBandInlineOption is used by SetSockOpt/GetSockOpt to specify whether
 // TCP out-of-band data is delivered along with the normal in-band data.
 type OutOfBandInlineOption int
 
-// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
-// a default TTL.
-type DefaultTTLOption uint8
+func (*OutOfBandInlineOption) isGettableSocketOption() {}
+
+func (*OutOfBandInlineOption) isSettableSocketOption() {}
 
 // SocketDetachFilterOption is used by SetSockOpt to detach a previously attached
 // classic BPF filter on a given endpoint.
 type SocketDetachFilterOption int
 
+func (*SocketDetachFilterOption) isSettableSocketOption() {}
+
 // OriginalDestinationOption is used to get the original destination address
 // and port of a redirected packet.
 type OriginalDestinationOption FullAddress
 
+func (*OriginalDestinationOption) isGettableSocketOption() {}
+
 // TCPTimeWaitReuseOption is used stack.(*Stack).TransportProtocolOption to
 // specify if the stack can reuse the port bound by an endpoint in TIME-WAIT for
 // new connections when it is safe from protocol viewpoint.
 type TCPTimeWaitReuseOption uint8
 
+func (*TCPTimeWaitReuseOption) isGettableSocketOption() {}
+
+func (*TCPTimeWaitReuseOption) isSettableSocketOption() {}
+
 const (
 	// TCPTimeWaitReuseDisabled indicates reuse of port bound by endponts in TIME-WAIT cannot
 	// be reused for new connections.
@@ -986,6 +1067,10 @@ type LingerOption struct {
 	Timeout time.Duration
 }
 
+func (*LingerOption) isGettableSocketOption() {}
+
+func (*LingerOption) isSettableSocketOption() {}
+
 // IPPacketInfo is the message structure for IP_PKTINFO.
 //
 // +stateify savable
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index c545c8367..346ca4bda 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -343,9 +343,9 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 }
 
 // SetSockOpt sets a socket option.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+	case *tcpip.SocketDetachFilterOption:
 		return nil
 	}
 	return nil
@@ -415,7 +415,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*endpoint) GetSockOpt(interface{}) *tcpip.Error {
+func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 95dc8ed57..81093e9ca 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -297,9 +297,9 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be
 // used with SetSockOpt, and this function always returns
 // tcpip.ErrNotSupported.
-func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+	case *tcpip.SocketDetachFilterOption:
 		return nil
 
 	default:
@@ -366,7 +366,7 @@ func (ep *endpoint) LastError() *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*endpoint) GetSockOpt(interface{}) *tcpip.Error {
+func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 2087bcfa8..71feeb748 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -510,9 +510,9 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 }
 
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+	case *tcpip.SocketDetachFilterOption:
 		return nil
 
 	default:
@@ -577,7 +577,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*endpoint) GetSockOpt(interface{}) *tcpip.Error {
+func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8a5e993b5..c5d9eba5d 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1736,10 +1736,10 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // SetSockOpt sets a socket option.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	switch v := opt.(type) {
-	case tcpip.BindToDeviceOption:
-		id := tcpip.NICID(v)
+	case *tcpip.BindToDeviceOption:
+		id := tcpip.NICID(*v)
 		if id != 0 && !e.stack.HasNIC(id) {
 			return tcpip.ErrUnknownDevice
 		}
@@ -1747,27 +1747,27 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.bindToDevice = id
 		e.UnlockUser()
 
-	case tcpip.KeepaliveIdleOption:
+	case *tcpip.KeepaliveIdleOption:
 		e.keepalive.Lock()
-		e.keepalive.idle = time.Duration(v)
+		e.keepalive.idle = time.Duration(*v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 
-	case tcpip.KeepaliveIntervalOption:
+	case *tcpip.KeepaliveIntervalOption:
 		e.keepalive.Lock()
-		e.keepalive.interval = time.Duration(v)
+		e.keepalive.interval = time.Duration(*v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 
-	case tcpip.OutOfBandInlineOption:
+	case *tcpip.OutOfBandInlineOption:
 		// We don't currently support disabling this option.
 
-	case tcpip.TCPUserTimeoutOption:
+	case *tcpip.TCPUserTimeoutOption:
 		e.LockUser()
-		e.userTimeout = time.Duration(v)
+		e.userTimeout = time.Duration(*v)
 		e.UnlockUser()
 
-	case tcpip.CongestionControlOption:
+	case *tcpip.CongestionControlOption:
 		// Query the available cc algorithms in the stack and
 		// validate that the specified algorithm is actually
 		// supported in the stack.
@@ -1777,10 +1777,10 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		}
 		availCC := strings.Split(string(avail), " ")
 		for _, cc := range availCC {
-			if v == tcpip.CongestionControlOption(cc) {
+			if *v == tcpip.CongestionControlOption(cc) {
 				e.LockUser()
 				state := e.EndpointState()
-				e.cc = v
+				e.cc = *v
 				switch state {
 				case StateEstablished:
 					if e.EndpointState() == state {
@@ -1796,43 +1796,43 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		// control algorithm is specified.
 		return tcpip.ErrNoSuchFile
 
-	case tcpip.TCPLingerTimeoutOption:
+	case *tcpip.TCPLingerTimeoutOption:
 		e.LockUser()
 
 		switch {
-		case v < 0:
+		case *v < 0:
 			// Same as effectively disabling TCPLinger timeout.
-			v = -1
-		case v == 0:
+			*v = -1
+		case *v == 0:
 			// Same as the stack default.
 			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
 			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
 				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
 			}
-			v = stackLingerTimeout
-		case v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
+			*v = stackLingerTimeout
+		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
 			// Cap it to Stack's default TCP_LINGER2 timeout.
-			v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
+			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
 		default:
 		}
 
-		e.tcpLingerTimeout = time.Duration(v)
+		e.tcpLingerTimeout = time.Duration(*v)
 		e.UnlockUser()
 
-	case tcpip.TCPDeferAcceptOption:
+	case *tcpip.TCPDeferAcceptOption:
 		e.LockUser()
-		if time.Duration(v) > MaxRTO {
-			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		if time.Duration(*v) > MaxRTO {
+			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
 		}
-		e.deferAccept = time.Duration(v)
+		e.deferAccept = time.Duration(*v)
 		e.UnlockUser()
 
-	case tcpip.SocketDetachFilterOption:
+	case *tcpip.SocketDetachFilterOption:
 		return nil
 
-	case tcpip.LingerOption:
+	case *tcpip.LingerOption:
 		e.LockUser()
-		e.linger = v
+		e.linger = *v
 		e.UnlockUser()
 
 	default:
@@ -1993,7 +1993,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 	switch o := opt.(type) {
 	case *tcpip.BindToDeviceOption:
 		e.LockUser()
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 3d3034d50..adb32e428 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1349,7 +1349,9 @@ func TestConnectBindToDevice(t *testing.T) {
 
 			c.Create(-1)
 			bindToDevice := tcpip.BindToDeviceOption(test.device)
-			c.EP.SetSockOpt(bindToDevice)
+			if err := c.EP.SetSockOpt(&bindToDevice); err != nil {
+				t.Fatalf("c.EP.SetSockOpt(&%T(%d)): %s", bindToDevice, bindToDevice, err)
+			}
 			// Start connection attempt.
 			waitEntry, _ := waiter.NewChannelEntry(nil)
 			c.WQ.EventRegister(&waitEntry, waiter.EventOut)
@@ -4321,16 +4323,15 @@ func TestBindToDeviceOption(t *testing.T) {
 		t.Run(testAction.name, func(t *testing.T) {
 			if testAction.setBindToDevice != nil {
 				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
-				if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
-					t.Errorf("SetSockOpt(%#v) got %v, want %v", bindToDevice, gotErr, wantErr)
+				if gotErr, wantErr := ep.SetSockOpt(&bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("got SetSockOpt(&%T(%d)) = %s, want = %s", bindToDevice, bindToDevice, gotErr, wantErr)
 				}
 			}
 			bindToDevice := tcpip.BindToDeviceOption(88888)
 			if err := ep.GetSockOpt(&bindToDevice); err != nil {
-				t.Errorf("GetSockOpt got %s, want %v", err, nil)
-			}
-			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
-				t.Errorf("bindToDevice got %d, want %d", got, want)
+				t.Errorf("GetSockOpt(&%T): %s", bindToDevice, err)
+			} else if bindToDevice != testAction.getBindToDevice {
+				t.Errorf("got bindToDevice = %d, want %d", bindToDevice, testAction.getBindToDevice)
 			}
 		})
 	}
@@ -4806,20 +4807,20 @@ func TestEndpointSetCongestionControl(t *testing.T) {
 
 				var oldCC tcpip.CongestionControlOption
 				if err := c.EP.GetSockOpt(&oldCC); err != nil {
-					t.Fatalf("c.EP.SockOpt(%v) = %s", &oldCC, err)
+					t.Fatalf("c.EP.GetSockOpt(&%T) = %s", oldCC, err)
 				}
 
 				if connected {
 					c.Connect(789 /* iss */, 32768 /* rcvWnd */, nil)
 				}
 
-				if err := c.EP.SetSockOpt(tc.cc); err != tc.err {
-					t.Fatalf("c.EP.SetSockOpt(%v) = %s, want %s", tc.cc, err, tc.err)
+				if err := c.EP.SetSockOpt(&tc.cc); err != tc.err {
+					t.Fatalf("got c.EP.SetSockOpt(&%#v) = %s, want %s", tc.cc, err, tc.err)
 				}
 
 				var cc tcpip.CongestionControlOption
 				if err := c.EP.GetSockOpt(&cc); err != nil {
-					t.Fatalf("c.EP.SockOpt(%v) = %s", &cc, err)
+					t.Fatalf("c.EP.GetSockOpt(&%T): %s", cc, err)
 				}
 
 				got, want := cc, oldCC
@@ -4831,7 +4832,7 @@ func TestEndpointSetCongestionControl(t *testing.T) {
 					want = tc.cc
 				}
 				if got != want {
-					t.Fatalf("got congestion control: %v, want: %v", got, want)
+					t.Fatalf("got congestion control = %+v, want = %+v", got, want)
 				}
 			})
 		}
@@ -4852,11 +4853,23 @@ func TestKeepalive(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
+	const keepAliveIdle = 100 * time.Millisecond
 	const keepAliveInterval = 3 * time.Second
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
-	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
+	keepAliveIdleOpt := tcpip.KeepaliveIdleOption(keepAliveIdle)
+	if err := c.EP.SetSockOpt(&keepAliveIdleOpt); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIdleOpt, keepAliveIdle, err)
+	}
+	keepAliveIntervalOpt := tcpip.KeepaliveIntervalOption(keepAliveInterval)
+	if err := c.EP.SetSockOpt(&keepAliveIntervalOpt); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIntervalOpt, keepAliveInterval, err)
+	}
 	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5)
-	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
+	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil {
+		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5): %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
+		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
+	}
 
 	// 5 unacked keepalives are sent. ACK each one, and check that the
 	// connection stays alive after 5.
@@ -6216,15 +6229,17 @@ func TestTCPLingerTimeout(t *testing.T) {
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			if err := c.EP.SetSockOpt(tcpip.TCPLingerTimeoutOption(tc.tcpLingerTimeout)); err != nil {
-				t.Fatalf("SetSockOpt(%s) = %s", tc.tcpLingerTimeout, err)
+			v := tcpip.TCPLingerTimeoutOption(tc.tcpLingerTimeout)
+			if err := c.EP.SetSockOpt(&v); err != nil {
+				t.Fatalf("SetSockOpt(&%T(%s)) = %s", v, tc.tcpLingerTimeout, err)
 			}
-			var v tcpip.TCPLingerTimeoutOption
+
+			v = 0
 			if err := c.EP.GetSockOpt(&v); err != nil {
-				t.Fatalf("GetSockOpt(tcpip.TCPLingerTimeoutOption) = %s", err)
+				t.Fatalf("GetSockOpt(&%T) = %s", v, err)
 			}
 			if got, want := time.Duration(v), tc.want; got != want {
-				t.Fatalf("unexpected linger timeout got: %s, want: %s", got, want)
+				t.Fatalf("got linger timeout = %s, want = %s", got, want)
 			}
 		})
 	}
@@ -6941,7 +6956,10 @@ func TestTCPUserTimeout(t *testing.T) {
 	// expired.
 	initRTO := 1 * time.Second
 	userTimeout := initRTO / 2
-	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+	v := tcpip.TCPUserTimeoutOption(userTimeout)
+	if err := c.EP.SetSockOpt(&v); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s): %s", v, userTimeout, err)
+	}
 
 	// Send some data and wait before ACKing it.
 	view := buffer.NewView(3)
@@ -7015,18 +7033,31 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 
 	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
+	const keepAliveIdle = 100 * time.Millisecond
 	const keepAliveInterval = 3 * time.Second
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
-	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
-	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10)
-	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
+	keepAliveIdleOption := tcpip.KeepaliveIdleOption(keepAliveIdle)
+	if err := c.EP.SetSockOpt(&keepAliveIdleOption); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIdleOption, keepAliveIdle, err)
+	}
+	keepAliveIntervalOption := tcpip.KeepaliveIntervalOption(keepAliveInterval)
+	if err := c.EP.SetSockOpt(&keepAliveIntervalOption); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIntervalOption, keepAliveInterval, err)
+	}
+	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil {
+		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10): %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
+		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
+	}
 
 	// Set userTimeout to be the duration to be 1 keepalive
 	// probes. Which means that after the first probe is sent
 	// the second one should cause the connection to be
 	// closed due to userTimeout being hit.
-	userTimeout := 1 * keepAliveInterval
-	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+	userTimeout := tcpip.TCPUserTimeoutOption(keepAliveInterval)
+	if err := c.EP.SetSockOpt(&userTimeout); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", userTimeout, keepAliveInterval, err)
+	}
 
 	// Check that the connection is still alive.
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
@@ -7233,8 +7264,9 @@ func TestTCPDeferAccept(t *testing.T) {
 	}
 
 	const tcpDeferAccept = 1 * time.Second
-	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
-		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err)
+	tcpDeferAcceptOption := tcpip.TCPDeferAcceptOption(tcpDeferAccept)
+	if err := c.EP.SetSockOpt(&tcpDeferAcceptOption); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", tcpDeferAcceptOption, tcpDeferAccept, err)
 	}
 
 	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
@@ -7290,8 +7322,9 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 	}
 
 	const tcpDeferAccept = 1 * time.Second
-	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
-		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err)
+	tcpDeferAcceptOpt := tcpip.TCPDeferAcceptOption(tcpDeferAccept)
+	if err := c.EP.SetSockOpt(&tcpDeferAcceptOpt); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)) failed: %s", tcpDeferAcceptOpt, tcpDeferAccept, err)
 	}
 
 	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 1d5ebe3f2..c74bc4d94 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -683,9 +683,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	switch v := opt.(type) {
-	case tcpip.MulticastInterfaceOption:
+	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		defer e.mu.Unlock()
 
@@ -721,7 +721,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.multicastNICID = nic
 		e.multicastAddr = addr
 
-	case tcpip.AddMembershipOption:
+	case *tcpip.AddMembershipOption:
 		if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) {
 			return tcpip.ErrInvalidOptionValue
 		}
@@ -764,7 +764,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 
 		e.multicastMemberships = append(e.multicastMemberships, memToInsert)
 
-	case tcpip.RemoveMembershipOption:
+	case *tcpip.RemoveMembershipOption:
 		if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) {
 			return tcpip.ErrInvalidOptionValue
 		}
@@ -808,8 +808,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.multicastMemberships[memToRemoveIndex] = e.multicastMemberships[len(e.multicastMemberships)-1]
 		e.multicastMemberships = e.multicastMemberships[:len(e.multicastMemberships)-1]
 
-	case tcpip.BindToDeviceOption:
-		id := tcpip.NICID(v)
+	case *tcpip.BindToDeviceOption:
+		id := tcpip.NICID(*v)
 		if id != 0 && !e.stack.HasNIC(id) {
 			return tcpip.ErrUnknownDevice
 		}
@@ -817,7 +817,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.bindToDevice = id
 		e.mu.Unlock()
 
-	case tcpip.SocketDetachFilterOption:
+	case *tcpip.SocketDetachFilterOption:
 		return nil
 	}
 	return nil
@@ -960,7 +960,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 	switch o := opt.(type) {
 	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index bd1c8ac31..0cbc045d8 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -539,7 +539,7 @@ func TestBindToDeviceOption(t *testing.T) {
 
 	opts := stack.NICOptions{Name: "my_device"}
 	if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil {
-		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
+		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %s", opts, err)
 	}
 
 	// nicIDPtr is used instead of taking the address of NICID literals, which is
@@ -563,16 +563,15 @@ func TestBindToDeviceOption(t *testing.T) {
 		t.Run(testAction.name, func(t *testing.T) {
 			if testAction.setBindToDevice != nil {
 				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
-				if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
-					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr)
+				if gotErr, wantErr := ep.SetSockOpt(&bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("got SetSockOpt(&%T(%d)) = %s, want = %s", bindToDevice, bindToDevice, gotErr, wantErr)
 				}
 			}
 			bindToDevice := tcpip.BindToDeviceOption(88888)
 			if err := ep.GetSockOpt(&bindToDevice); err != nil {
-				t.Errorf("GetSockOpt got %v, want %v", err, nil)
-			}
-			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
-				t.Errorf("bindToDevice got %d, want %d", got, want)
+				t.Errorf("GetSockOpt(&%T): %s", bindToDevice, err)
+			} else if bindToDevice != testAction.getBindToDevice {
+				t.Errorf("got bindToDevice = %d, want = %d", bindToDevice, testAction.getBindToDevice)
 			}
 		})
 	}
@@ -628,12 +627,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	// Check the peer address.
 	h := flow.header4Tuple(incoming)
 	if addr.Addr != h.srcAddr.Addr {
-		c.t.Fatalf("unexpected remote address: got %s, want %v", addr.Addr, h.srcAddr)
+		c.t.Fatalf("got address = %s, want = %s", addr.Addr, h.srcAddr.Addr)
 	}
 
 	// Check the payload.
 	if !bytes.Equal(payload, v) {
-		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
+		c.t.Fatalf("got payload = %x, want = %x", v, payload)
 	}
 
 	// Run any checkers against the ControlMessages.
@@ -694,7 +693,7 @@ func TestBindReservedPort(t *testing.T) {
 		}
 		defer ep.Close()
 		if got, want := ep.Bind(addr), tcpip.ErrPortInUse; got != want {
-			t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want)
+			t.Fatalf("got ep.Bind(...) = %s, want = %s", got, want)
 		}
 	}
 
@@ -707,7 +706,7 @@ func TestBindReservedPort(t *testing.T) {
 		// We can't bind ipv4-any on the port reserved by the connected endpoint
 		// above, since the endpoint is dual-stack.
 		if got, want := ep.Bind(tcpip.FullAddress{Port: addr.Port}), tcpip.ErrPortInUse; got != want {
-			t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want)
+			t.Fatalf("got ep.Bind(...) = %s, want = %s", got, want)
 		}
 		// We can bind an ipv4 address on this port, though.
 		if err := ep.Bind(tcpip.FullAddress{Addr: stackAddr, Port: addr.Port}); err != nil {
@@ -830,7 +829,7 @@ func TestV4ReadSelfSource(t *testing.T) {
 			}
 
 			if _, _, err := c.ep.Read(nil); err != tt.wantErr {
-				t.Errorf("c.ep.Read() got error %v, want %v", err, tt.wantErr)
+				t.Errorf("got c.ep.Read(nil) = %s, want = %s", err, tt.wantErr)
 			}
 		})
 	}
@@ -871,8 +870,8 @@ func TestReadOnBoundToMulticast(t *testing.T) {
 
 			// Join multicast group.
 			ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: mcastAddr}
-			if err := c.ep.SetSockOpt(ifoptSet); err != nil {
-				c.t.Fatal("SetSockOpt failed:", err)
+			if err := c.ep.SetSockOpt(&ifoptSet); err != nil {
+				c.t.Fatalf("SetSockOpt(&%#v): %s", ifoptSet, err)
 			}
 
 			// Check that we receive multicast packets but not unicast or broadcast
@@ -1403,8 +1402,8 @@ func TestReadIPPacketInfo(t *testing.T) {
 
 			if test.flow.isMulticast() {
 				ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: test.flow.getMcastAddr()}
-				if err := c.ep.SetSockOpt(ifoptSet); err != nil {
-					c.t.Fatalf("SetSockOpt(%+v): %s:", ifoptSet, err)
+				if err := c.ep.SetSockOpt(&ifoptSet); err != nil {
+					c.t.Fatalf("SetSockOpt(&%#v): %s:", ifoptSet, err)
 				}
 			}
 
@@ -1547,7 +1546,7 @@ func TestSetTOS(t *testing.T) {
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0)
+				c.t.Errorf("got GetSockOptInt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0)
 			}
 
 			if err := c.ep.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
@@ -1708,19 +1707,17 @@ func TestMulticastInterfaceOption(t *testing.T) {
 								}
 							}
 
-							if err := c.ep.SetSockOpt(ifoptSet); err != nil {
-								c.t.Fatalf("SetSockOpt failed: %s", err)
+							if err := c.ep.SetSockOpt(&ifoptSet); err != nil {
+								c.t.Fatalf("SetSockOpt(&%#v): %s", ifoptSet, err)
 							}
 
 							// Verify multicast interface addr and NIC were set correctly.
 							// Note that NIC must be 1 since this is our outgoing interface.
-							ifoptWant := tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr}
 							var ifoptGot tcpip.MulticastInterfaceOption
 							if err := c.ep.GetSockOpt(&ifoptGot); err != nil {
-								c.t.Fatalf("GetSockOpt failed: %s", err)
-							}
-							if ifoptGot != ifoptWant {
-								c.t.Errorf("got GetSockOpt() = %#v, want = %#v", ifoptGot, ifoptWant)
+								c.t.Fatalf("GetSockOpt(&%T): %s", ifoptGot, err)
+							} else if ifoptWant := (tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr}); ifoptGot != ifoptWant {
+								c.t.Errorf("got multicast interface option = %#v, want = %#v", ifoptGot, ifoptWant)
 							}
 						})
 					}
-- 
cgit v1.2.3


From e9b5fda2f1d44a50d67ae3c30400f9b05048fc9d Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Thu, 27 Aug 2020 16:28:36 -0700
Subject: [go-marshal] Support for usermem.IOOpts.

PiperOrigin-RevId: 328839759
---
 pkg/sentry/kernel/task_usermem.go | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 4550b9f89..0cb86e390 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -301,3 +301,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
 		Opts:  opts,
 	}, nil
 }
+
+// CopyContextWithOpts wraps a task to allow copying memory to and from the
+// task memory with user specified usermem.IOOpts.
+type CopyContextWithOpts struct {
+	*Task
+	opts usermem.IOOpts
+}
+
+// AsCopyContextWithOpts wraps the task and returns it as CopyContextWithOpts.
+func (t *Task) AsCopyContextWithOpts(opts usermem.IOOpts) *CopyContextWithOpts {
+	return &CopyContextWithOpts{t, opts}
+}
+
+// CopyInString copies a string in from the task's memory.
+func (t *CopyContextWithOpts) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+	return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+}
+
+// CopyInBytes copies task memory into dst from an IO context.
+func (t *CopyContextWithOpts) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+	return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+}
+
+// CopyOutBytes copies src into task memoryfrom an IO context.
+func (t *CopyContextWithOpts) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+	return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+}
-- 
cgit v1.2.3


From 05166f14c93323d6279987ae3fe9a803ad188ade Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 27 Aug 2020 16:50:11 -0700
Subject: unix: return ECONNREFUSE if a socket file exists but a socket isn't
 bound to it

PiperOrigin-RevId: 328843560
---
 pkg/sentry/fsimpl/gofer/filesystem.go |  4 +++-
 pkg/sentry/fsimpl/tmpfs/filesystem.go |  3 +++
 test/syscalls/linux/mknod.cc          | 19 +++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 4d581fc29..5d0f487db 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1512,7 +1512,9 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 				path:   opts.Addr,
 			}, nil
 		}
-		return d.endpoint, nil
+		if d.endpoint != nil {
+			return d.endpoint, nil
+		}
 	}
 	return nil, syserror.ECONNREFUSED
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index eddfeab76..e0de04e05 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -783,6 +783,9 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	}
 	switch impl := d.inode.impl.(type) {
 	case *socketFile:
+		if impl.ep == nil {
+			return nil, syserror.ECONNREFUSED
+		}
 		return impl.ep, nil
 	default:
 		return nil, syserror.ECONNREFUSED
diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc
index 05dfb375a..2ba8c11b8 100644
--- a/test/syscalls/linux/mknod.cc
+++ b/test/syscalls/linux/mknod.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/un.h>
@@ -103,6 +104,24 @@ TEST(MknodTest, UnimplementedTypesReturnError) {
   ASSERT_THAT(mknod(path.c_str(), S_IFBLK, 0), SyscallFailsWithErrno(EPERM));
 }
 
+TEST(MknodTest, Socket) {
+  ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+
+  SKIP_IF(IsRunningOnGvisor() && IsRunningWithVFS1());
+
+  ASSERT_THAT(mknod("./file0", S_IFSOCK | S_IRUSR | S_IWUSR, 0),
+              SyscallSucceeds());
+
+  int sk;
+  ASSERT_THAT(sk = socket(AF_UNIX, SOCK_SEQPACKET, 0), SyscallSucceeds());
+  FileDescriptor fd(sk);
+
+  struct sockaddr_un addr = {.sun_family = AF_UNIX};
+  absl::SNPrintF(addr.sun_path, sizeof(addr.sun_path), "./file0");
+  ASSERT_THAT(connect(sk, (struct sockaddr *)&addr, sizeof(addr)),
+              SyscallFailsWithErrno(ECONNREFUSED));
+}
+
 TEST(MknodTest, Fifo) {
   const std::string fifo = NewTempAbsPath();
   ASSERT_THAT(mknod(fifo.c_str(), S_IFIFO | S_IRUSR | S_IWUSR, 0),
-- 
cgit v1.2.3


From bb089f9c9075a78e8bde7ff946bac77badc08894 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 27 Aug 2020 16:52:21 -0700
Subject: Fix vfs2 pipe behavior when splicing to a non-pipe.

Fixes *.sh Java runtime tests, where splice()-ing from a pipe to /dev/zero
would not actually empty the pipe.

There was no guarantee that the data would actually be consumed on a splice
operation unless the output file's implementation of Write/PWrite actually
called VFSPipeFD.CopyIn. Now, whatever bytes are "written" are consumed
regardless of whether CopyIn is called or not.

Furthermore, the number of bytes in the IOSequence for reads is now capped at
the amount of data actually available. Before, splicing to /dev/zero would
always return the requested splice size without taking the actual available
data into account.

This change also refactors the case where an input file is spliced into an
output pipe so that it follows a similar pattern, which is arguably cleaner
anyway.

Updates #3576.

PiperOrigin-RevId: 328843954
---
 pkg/buffer/BUILD                         |  2 +
 pkg/sentry/kernel/pipe/pipe.go           | 14 ++++---
 pkg/sentry/kernel/pipe/vfs.go            | 63 ++++++++++++++++++++++++-------
 pkg/sentry/syscalls/linux/vfs2/splice.go | 35 ++++++++---------
 test/syscalls/linux/sendfile.cc          | 36 +++++++++++++++++-
 test/syscalls/linux/splice.cc            | 64 +++++++++++++++++++++++++++++++-
 6 files changed, 174 insertions(+), 40 deletions(-)

(limited to 'pkg')

diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD
index dcd086298..b03d46d18 100644
--- a/pkg/buffer/BUILD
+++ b/pkg/buffer/BUILD
@@ -26,8 +26,10 @@ go_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/context",
         "//pkg/log",
         "//pkg/safemem",
+        "//pkg/usermem",
     ],
 )
 
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 297e8f28f..c410c96aa 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -200,17 +200,17 @@ type readOps struct {
 //
 // Precondition: this pipe must have readers.
 func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
-	// Don't block for a zero-length read even if the pipe is empty.
-	if ops.left() == 0 {
-		return 0, nil
-	}
-
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	return p.readLocked(ctx, ops)
 }
 
 func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
+	// Don't block for a zero-length read even if the pipe is empty.
+	if ops.left() == 0 {
+		return 0, nil
+	}
+
 	// Is the pipe empty?
 	if p.view.Size() == 0 {
 		if !p.HasWriters() {
@@ -388,6 +388,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
 func (p *Pipe) queued() int64 {
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	return p.queuedLocked()
+}
+
+func (p *Pipe) queuedLocked() int64 {
 	return p.view.Size()
 }
 
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 28f998e45..f223d59e1 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -244,19 +244,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
 	return fd.pipe.SetFifoSize(size)
 }
 
-// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
-// or writes up to count bytes to, fd.
-func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
-	return usermem.IOSequence{
+// SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
+func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+
+	// Cap the sequence at number of bytes actually available.
+	v := fd.pipe.queuedLocked()
+	if v < count {
+		count = v
+	}
+	src := usermem.IOSequence{
 		IO:    fd,
 		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
 	}
+
+	var (
+		n   int64
+		err error
+	)
+	if off == -1 {
+		n, err = out.Write(ctx, src, vfs.WriteOptions{})
+	} else {
+		n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
+	}
+	if n > 0 {
+		fd.pipe.view.TrimFront(n)
+	}
+	return n, err
+}
+
+// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
+func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+
+	dst := usermem.IOSequence{
+		IO:    fd,
+		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+	}
+
+	if off == -1 {
+		return in.Read(ctx, dst, vfs.ReadOptions{})
+	}
+	return in.PRead(ctx, dst, off, vfs.ReadOptions{})
 }
 
-// CopyIn implements usermem.IO.CopyIn.
+// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
 func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
 	origCount := int64(len(dst))
-	n, err := fd.pipe.read(ctx, readOps{
+	n, err := fd.pipe.readLocked(ctx, readOps{
 		left: func() int64 {
 			return int64(len(dst))
 		},
@@ -265,7 +303,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
 		},
 		read: func(view *buffer.View) (int64, error) {
 			n, err := view.ReadAt(dst, 0)
-			view.TrimFront(int64(n))
 			return int64(n), err
 		},
 	})
@@ -281,7 +318,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
 // CopyOut implements usermem.IO.CopyOut.
 func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
 	origCount := int64(len(src))
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return int64(len(src))
 		},
@@ -305,7 +342,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte,
 // ZeroOut implements usermem.IO.ZeroOut.
 func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
 	origCount := toZero
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return toZero
 		},
@@ -326,14 +363,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6
 	return n, err
 }
 
-// CopyInTo implements usermem.IO.CopyInTo.
+// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
 func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
 	count := ars.NumBytes()
 	if count == 0 {
 		return 0, nil
 	}
 	origCount := count
-	n, err := fd.pipe.read(ctx, readOps{
+	n, err := fd.pipe.readLocked(ctx, readOps{
 		left: func() int64 {
 			return count
 		},
@@ -342,7 +380,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst
 		},
 		read: func(view *buffer.View) (int64, error) {
 			n, err := view.ReadToSafememWriter(dst, uint64(count))
-			view.TrimFront(int64(n))
 			return int64(n), err
 		},
 	})
@@ -362,7 +399,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq,
 		return 0, nil
 	}
 	origCount := count
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return count
 		},
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 75bfa2c79..192411393 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -131,18 +131,14 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		case inIsPipe && outIsPipe:
 			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
 		case inIsPipe:
+			n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count)
 			if outOffset != -1 {
-				n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
 				outOffset += n
-			} else {
-				n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
 			}
 		case outIsPipe:
+			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count)
 			if inOffset != -1 {
-				n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
 				inOffset += n
-			} else {
-				n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
 			}
 		default:
 			panic("not possible")
@@ -341,17 +337,15 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	if outIsPipe {
 		for n < count {
 			var spliceN int64
-			if offset != -1 {
-				spliceN, err = inFile.PRead(t, outPipeFD.IOSequence(count), offset, vfs.ReadOptions{})
-				offset += spliceN
-			} else {
-				spliceN, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
-			}
+			spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count)
 			if spliceN == 0 && err == io.EOF {
 				// We reached the end of the file. Eat the error and exit the loop.
 				err = nil
 				break
 			}
+			if offset != -1 {
+				offset += spliceN
+			}
 			n += spliceN
 			if err == syserror.ErrWouldBlock && !nonBlock {
 				err = dw.waitForBoth(t)
@@ -371,19 +365,18 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			} else {
 				readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
 			}
-			if readN == 0 && err == io.EOF {
-				// We reached the end of the file. Eat the error and exit the loop.
-				err = nil
+			if readN == 0 && err != nil {
+				if err == io.EOF {
+					// We reached the end of the file. Eat the error before exiting the loop.
+					err = nil
+				}
 				break
 			}
 			n += readN
-			if err != nil {
-				break
-			}
 
 			// Write all of the bytes that we read. This may need
 			// multiple write calls to complete.
-			wbuf := buf[:n]
+			wbuf := buf[:readN]
 			for len(wbuf) > 0 {
 				var writeN int64
 				writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{})
@@ -398,6 +391,10 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 					notWritten := int64(len(wbuf))
 					n -= notWritten
 					if offset != -1 {
+						// TODO(gvisor.dev/issue/3779): The inFile offset will be incorrect if we
+						// roll back, because it has already been advanced by the full amount.
+						// Merely seeking on inFile does not work, because there may be concurrent
+						// file operations.
 						offset -= notWritten
 					}
 					break
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index 64123e904..e65387f59 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -198,7 +198,39 @@ TEST(SendFileTest, SendAndUpdateFileOffset) {
   EXPECT_EQ(absl::string_view(kData, kHalfDataSize),
             absl::string_view(actual, bytes_sent));
 
-  // Verify that the input file offset has been updated
+  // Verify that the input file offset has been updated.
+  ASSERT_THAT(read(inf.get(), &actual, kDataSize - bytes_sent),
+              SyscallSucceedsWithValue(kHalfDataSize));
+  EXPECT_EQ(
+      absl::string_view(kData + kDataSize - bytes_sent, kDataSize - bytes_sent),
+      absl::string_view(actual, kHalfDataSize));
+}
+
+TEST(SendFileTest, SendToDevZeroAndUpdateFileOffset) {
+  // Create temp files.
+  // Test input string length must be > 2 AND even.
+  constexpr char kData[] = "The slings and arrows of outrageous fortune,";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  constexpr int kHalfDataSize = kDataSize / 2;
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+  // Open the input file as read only.
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Open /dev/zero as write only.
+  const FileDescriptor outf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY));
+
+  // Send data and verify that sendfile returns the correct value.
+  int bytes_sent;
+  EXPECT_THAT(
+      bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kHalfDataSize),
+      SyscallSucceedsWithValue(kHalfDataSize));
+
+  char actual[kHalfDataSize];
+  // Verify that the input file offset has been updated.
   ASSERT_THAT(read(inf.get(), &actual, kDataSize - bytes_sent),
               SyscallSucceedsWithValue(kHalfDataSize));
   EXPECT_EQ(
@@ -250,7 +282,7 @@ TEST(SendFileTest, SendAndUpdateFileOffsetFromNonzeroStartingPoint) {
   EXPECT_EQ(absl::string_view(kData + kQuarterDataSize, kHalfDataSize),
             absl::string_view(actual, bytes_sent));
 
-  // Verify that the input file offset has been updated
+  // Verify that the input file offset has been updated.
   ASSERT_THAT(read(inf.get(), &actual, kQuarterDataSize),
               SyscallSucceedsWithValue(kQuarterDataSize));
 
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 08fc4b1b7..be3fb4840 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -342,7 +342,7 @@ TEST(SpliceTest, FromPipe) {
   ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
               SyscallSucceedsWithValue(kPageSize));
 
-  // Open the input file.
+  // Open the output file.
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor out_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
@@ -364,6 +364,40 @@ TEST(SpliceTest, FromPipe) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
 }
 
+TEST(SpliceTest, FromPipeMultiple) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  std::string buf = "abcABC123";
+  ASSERT_THAT(write(wfd.get(), buf.c_str(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  // Open the output file.
+  const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor out_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
+
+  // Splice from the pipe to the output file over several calls.
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3, 0),
+              SyscallSucceedsWithValue(3));
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3, 0),
+              SyscallSucceedsWithValue(3));
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3, 0),
+              SyscallSucceedsWithValue(3));
+
+  // Reset cursor to zero so that we can check the contents.
+  ASSERT_THAT(lseek(out_fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+  // Contents should be equal.
+  std::vector<char> rbuf(buf.size());
+  ASSERT_THAT(read(out_fd.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(rbuf.size()));
+  EXPECT_EQ(memcmp(rbuf.data(), buf.c_str(), buf.size()), 0);
+}
+
 TEST(SpliceTest, FromPipeOffset) {
   // Create a new pipe.
   int fds[2];
@@ -693,6 +727,34 @@ TEST(SpliceTest, FromPipeMaxFileSize) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
 }
 
+TEST(SpliceTest, FromPipeToDevZero) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  FileDescriptor wfd(fds[1]);
+
+  // Fill with some random data.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  const FileDescriptor zero =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY));
+
+  // Close the write end to prevent blocking below.
+  wfd.reset();
+
+  // Splice to /dev/zero. The first call should empty the pipe, and the return
+  // value should not exceed the number of bytes available for reading.
+  EXPECT_THAT(
+      splice(rfd.get(), nullptr, zero.get(), nullptr, kPageSize + 123, 0),
+      SyscallSucceedsWithValue(kPageSize));
+  EXPECT_THAT(splice(rfd.get(), nullptr, zero.get(), nullptr, 1, 0),
+              SyscallSucceedsWithValue(0));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From cd6374cad39d4aea4a97b425de681b16e05851d3 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Thu, 27 Aug 2020 19:25:23 -0700
Subject: [go-marshal] Enable auto-marshalling for tundev.

PiperOrigin-RevId: 328863725
---
 pkg/sentry/devices/tundev/tundev.go | 17 +++++++----------
 pkg/sentry/fs/dev/net_tun.go        | 17 +++++++----------
 2 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go
index a40625e19..0b701a289 100644
--- a/pkg/sentry/devices/tundev/tundev.go
+++ b/pkg/sentry/devices/tundev/tundev.go
@@ -64,12 +64,13 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg
 	request := args[1].Uint()
 	data := args[2].Pointer()
 
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		panic("Ioctl should be called from a task context")
+	}
+
 	switch request {
 	case linux.TUNSETIFF:
-		t := kernel.TaskFromContext(ctx)
-		if t == nil {
-			panic("Ioctl should be called from a task context")
-		}
 		if !t.HasCapability(linux.CAP_NET_ADMIN) {
 			return 0, syserror.EPERM
 		}
@@ -79,9 +80,7 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg
 		}
 
 		var req linux.IFReq
-		if _, err := usermem.CopyObjectIn(ctx, uio, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := req.CopyIn(t, data); err != nil {
 			return 0, err
 		}
 		flags := usermem.ByteOrder.Uint16(req.Data[:])
@@ -97,9 +96,7 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg
 		flags := fd.device.Flags() | linux.IFF_NOFILTER
 		usermem.ByteOrder.PutUint16(req.Data[:], flags)
 
-		_, err := usermem.CopyObjectOut(ctx, uio, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err := req.CopyOut(t, data)
 		return 0, err
 
 	default:
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index ec474e554..5f8c9b5a2 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -89,12 +89,13 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
 	request := args[1].Uint()
 	data := args[2].Pointer()
 
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		panic("Ioctl should be called from a task context")
+	}
+
 	switch request {
 	case linux.TUNSETIFF:
-		t := kernel.TaskFromContext(ctx)
-		if t == nil {
-			panic("Ioctl should be called from a task context")
-		}
 		if !t.HasCapability(linux.CAP_NET_ADMIN) {
 			return 0, syserror.EPERM
 		}
@@ -104,9 +105,7 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
 		}
 
 		var req linux.IFReq
-		if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := req.CopyIn(t, data); err != nil {
 			return 0, err
 		}
 		flags := usermem.ByteOrder.Uint16(req.Data[:])
@@ -122,9 +121,7 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
 		flags := fops.device.Flags() | linux.IFF_NOFILTER
 		usermem.ByteOrder.PutUint16(req.Data[:], flags)
 
-		_, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err := req.CopyOut(t, data)
 		return 0, err
 
 	default:
-- 
cgit v1.2.3


From c77a532936f245b0525703eb7e72a6cdf62c00b0 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 28 Aug 2020 05:06:50 -0700
Subject: Use a single NetworkEndpoint per address

This change was already done as of
https://github.com/google/gvisor/commit/1736b2208f but
https://github.com/google/gvisor/commit/a174aa7597 conflicted with that
change and it was missed in reviews.

This change fixes the conflict.

PiperOrigin-RevId: 328920372
---
 pkg/tcpip/stack/nic.go | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 0c811efdb..8e700990d 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -137,6 +137,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	nic.mu.ndp.initializeTempAddrState()
 
 	// Check for Neighbor Unreachability Detection support.
+	var nud NUDHandler
 	if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 && stack.useNeighborCache {
 		rng := rand.New(rand.NewSource(stack.clock.NowNanoseconds()))
 		nic.neigh = &neighborCache{
@@ -144,16 +145,24 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 			state: NewNUDState(stack.nudConfigs, rng),
 			cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize),
 		}
+
+		// An interface value that holds a nil pointer but non-nil type is not the
+		// same as the nil interface. Because of this, nud must only be assignd if
+		// nic.neigh is non-nil since a nil reference to a neighborCache is not
+		// valid.
+		//
+		// See https://golang.org/doc/faq#nil_error for more information.
+		nud = nic.neigh
 	}
 
-	// Register supported packet endpoint protocols.
+	// Register supported packet and network endpoint protocols.
 	for _, netProto := range header.Ethertypes {
 		nic.mu.packetEPs[netProto] = []PacketEndpoint{}
 	}
 	for _, netProto := range stack.networkProtocols {
 		netNum := netProto.Number()
 		nic.mu.packetEPs[netNum] = nil
-		nic.networkEndpoints[netNum] = netProto.NewEndpoint(id, stack, nic.neigh, nic, ep, stack)
+		nic.networkEndpoints[netNum] = netProto.NewEndpoint(id, stack, nud, nic, ep, stack)
 	}
 
 	nic.linkEP.Attach(nic)
@@ -819,24 +828,11 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 		}
 	}
 
-	netProto, ok := n.stack.networkProtocols[protocolAddress.Protocol]
+	ep, ok := n.networkEndpoints[protocolAddress.Protocol]
 	if !ok {
 		return nil, tcpip.ErrUnknownProtocol
 	}
 
-	var nud NUDHandler
-	if n.neigh != nil {
-		// An interface value that holds a nil concrete value is itself non-nil.
-		// For this reason, n.neigh cannot be passed directly to NewEndpoint so
-		// NetworkEndpoints don't confuse it for non-nil.
-		//
-		// See https://golang.org/doc/faq#nil_error for more information.
-		nud = n.neigh
-	}
-
-	// Create the new network endpoint.
-	ep := netProto.NewEndpoint(n.id, n.stack, nud, n, n.linkEP, n.stack)
-
 	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
 
 	// If the address is an IPv6 address and it is a permanent address,
-- 
cgit v1.2.3


From c9842f21ce4a9308dba983fd712cc688b26237d5 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 28 Aug 2020 10:33:44 -0700
Subject: fix panic when calling SO_ORIGINAL_DST without initializing iptables

Reported-by: syzbot+074ec22c42305725b79f@syzkaller.appspotmail.com
PiperOrigin-RevId: 328963899
---
 pkg/tcpip/stack/iptables.go     |  5 +++++
 test/syscalls/linux/iptables.cc | 13 +++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 30aa41db2..0e33cbe92 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -427,5 +427,10 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx
 // OriginalDst returns the original destination of redirected connections. It
 // returns an error if the connection doesn't exist or isn't redirected.
 func (it *IPTables) OriginalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	if !it.modified {
+		return "", 0, tcpip.ErrNotConnected
+	}
 	return it.connections.originalDst(epID)
 }
diff --git a/test/syscalls/linux/iptables.cc b/test/syscalls/linux/iptables.cc
index f1af8f097..83b6a164a 100644
--- a/test/syscalls/linux/iptables.cc
+++ b/test/syscalls/linux/iptables.cc
@@ -104,6 +104,19 @@ TEST(IPTablesBasic, GetEntriesErrorPrecedence) {
       SyscallFailsWithErrno(EINVAL));
 }
 
+TEST(IPTablesBasic, OriginalDstErrors) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_STREAM, 0), SyscallSucceeds());
+
+  // Sockets not affected by NAT should fail to find an original destination.
+  struct sockaddr_in addr = {};
+  socklen_t addr_len = sizeof(addr);
+  EXPECT_THAT(getsockopt(sock, SOL_IP, SO_ORIGINAL_DST, &addr, &addr_len),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
 // Fixture for iptables tests.
 class IPTablesTest : public ::testing::Test {
  protected:
-- 
cgit v1.2.3


From 4346e36ba286338f6615eb9b22425808cf186775 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 28 Aug 2020 11:26:25 -0700
Subject: Fix EOF handling for splice.

Also, add corresponding EOF tests for splice/sendfile.

Discovered by syzkaller.

PiperOrigin-RevId: 328975990
---
 pkg/sentry/syscalls/linux/vfs2/splice.go |  7 ++++++-
 test/syscalls/linux/sendfile.cc          | 16 ++++++++++++++++
 test/syscalls/linux/splice.cc            | 17 +++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 192411393..68ce94778 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -141,9 +141,14 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 				inOffset += n
 			}
 		default:
-			panic("not possible")
+			panic("at least one end of splice must be a pipe")
 		}
 
+		if n == 0 && err == io.EOF {
+			// We reached the end of the file. Eat the error and exit the loop.
+			err = nil
+			break
+		}
 		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
 			break
 		}
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index e65387f59..a8bfb01f1 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -533,6 +533,22 @@ TEST(SendFileTest, SendPipeWouldBlock) {
               SyscallFailsWithErrno(EWOULDBLOCK));
 }
 
+TEST(SendFileTest, SendPipeEOF) {
+  // Create and open an empty input file.
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Setup the output named pipe.
+  int fds[2];
+  ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, 123),
+              SyscallSucceedsWithValue(0));
+}
+
 TEST(SendFileTest, SendPipeBlocks) {
   // Create temp file.
   constexpr char kData[] =
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index be3fb4840..a1d2b9b11 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -298,6 +298,23 @@ TEST(SpliceTest, ToPipe) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
 }
 
+TEST(SpliceTest, ToPipeEOF) {
+  // Create and open an empty input file.
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor in_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Splice from the empty file to the pipe.
+  EXPECT_THAT(splice(in_fd.get(), nullptr, wfd.get(), nullptr, 123, 0),
+              SyscallSucceedsWithValue(0));
+}
+
 TEST(SpliceTest, ToPipeOffset) {
   // Open the input file.
   const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-- 
cgit v1.2.3


From 91e81aaf69ac5fc4cd7b677139c6a23801eabb02 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 28 Aug 2020 11:47:58 -0700
Subject: Improve type safety for network protocol options

The existing implementation for NetworkProtocol.{Set}Option take
arguments of an empty interface type which all types (implicitly)
implement; any type may be passed to the functions.

This change introduces marker interfaces for network protocol options
that may be set or queried which network protocol option types implement
to ensure that invalid types are caught at compile time. Different
interfaces are used to allow the compiler to enforce read-only or
set-only socket options.

PiperOrigin-RevId: 328980359
---
 pkg/tcpip/network/arp/arp.go      |  4 +--
 pkg/tcpip/network/ipv4/ipv4.go    |  8 ++---
 pkg/tcpip/network/ipv6/ipv6.go    |  8 ++---
 pkg/tcpip/stack/forwarder_test.go | 10 +++---
 pkg/tcpip/stack/nic_test.go       |  4 +--
 pkg/tcpip/stack/registration.go   |  4 +--
 pkg/tcpip/stack/stack.go          |  4 +--
 pkg/tcpip/stack/stack_test.go     | 68 ++++++++++-----------------------------
 pkg/tcpip/tcpip.go                | 16 +++++++++
 runsc/boot/loader.go              |  9 ++++--
 10 files changed, 61 insertions(+), 74 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index cbbe5b77f..7aaee08c4 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -217,12 +217,12 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 }
 
 // SetOption implements stack.NetworkProtocol.SetOption.
-func (*protocol) SetOption(option interface{}) *tcpip.Error {
+func (*protocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // Option implements stack.NetworkProtocol.Option.
-func (*protocol) Option(option interface{}) *tcpip.Error {
+func (*protocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 55ca94268..fa4ae2012 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -486,10 +486,10 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }
 
 // SetOption implements NetworkProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case tcpip.DefaultTTLOption:
-		p.SetDefaultTTL(uint8(v))
+	case *tcpip.DefaultTTLOption:
+		p.SetDefaultTTL(uint8(*v))
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -497,7 +497,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 }
 
 // Option implements NetworkProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
 	case *tcpip.DefaultTTLOption:
 		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 36fbbebf0..af3cd91c6 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -469,10 +469,10 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddres
 }
 
 // SetOption implements NetworkProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case tcpip.DefaultTTLOption:
-		p.SetDefaultTTL(uint8(v))
+	case *tcpip.DefaultTTLOption:
+		p.SetDefaultTTL(uint8(*v))
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -480,7 +480,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 }
 
 // Option implements NetworkProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
 	case *tcpip.DefaultTTLOption:
 		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 91165ebc7..54759091a 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -154,17 +154,17 @@ func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCac
 	}
 }
 
-func (f *fwdTestNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
+func (*fwdTestNetworkProtocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-func (f *fwdTestNetworkProtocol) Option(option interface{}) *tcpip.Error {
+func (*fwdTestNetworkProtocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-func (f *fwdTestNetworkProtocol) Close() {}
+func (*fwdTestNetworkProtocol) Close() {}
 
-func (f *fwdTestNetworkProtocol) Wait() {}
+func (*fwdTestNetworkProtocol) Wait() {}
 
 func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error {
 	if f.onLinkAddressResolved != nil {
@@ -182,7 +182,7 @@ func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip
 	return "", false
 }
 
-func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+func (*fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 	return fwdTestNetNumber
 }
 
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index 1e065b5c1..dd6474297 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -201,12 +201,12 @@ func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCache, _
 }
 
 // SetOption implements NetworkProtocol.SetOption.
-func (*testIPv6Protocol) SetOption(interface{}) *tcpip.Error {
+func (*testIPv6Protocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	return nil
 }
 
 // Option implements NetworkProtocol.Option.
-func (*testIPv6Protocol) Option(interface{}) *tcpip.Error {
+func (*testIPv6Protocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	return nil
 }
 
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 21ac38583..2d88fa1f7 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -303,12 +303,12 @@ type NetworkProtocol interface {
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
 	// provided option value is invalid.
-	SetOption(option interface{}) *tcpip.Error
+	SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error
 
 	// Option allows retrieving protocol specific option values.
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
-	Option(option interface{}) *tcpip.Error
+	Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error
 
 	// Close requests that any worker goroutines owned by the protocol
 	// stop.
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7f5ed9e83..c86ee1c13 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -785,7 +785,7 @@ func (s *Stack) UniqueID() uint64 {
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
 // is incorrect.
-func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	netProto, ok := s.networkProtocols[network]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -802,7 +802,7 @@ func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, op
 // if err != nil {
 //   ...
 // }
-func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	netProto, ok := s.networkProtocols[network]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 1deeccb89..60b54c244 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -158,23 +158,13 @@ func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack
 
 func (*fakeNetworkEndpoint) Close() {}
 
-type fakeNetGoodOption bool
-
-type fakeNetBadOption bool
-
-type fakeNetInvalidValueOption int
-
-type fakeNetOptions struct {
-	good bool
-}
-
 // fakeNetworkProtocol is a network-layer protocol descriptor. It aggregates the
 // number of packets sent and received via endpoints of this protocol. The index
 // where packets are added is given by the packet's destination address MOD 10.
 type fakeNetworkProtocol struct {
 	packetCount     [10]int
 	sendPacketCount [10]int
-	opts            fakeNetOptions
+	defaultTTL      uint8
 }
 
 func (f *fakeNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
@@ -206,22 +196,20 @@ func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, _ stack.LinkAddress
 	}
 }
 
-func (f *fakeNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
+func (f *fakeNetworkProtocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case fakeNetGoodOption:
-		f.opts.good = bool(v)
+	case *tcpip.DefaultTTLOption:
+		f.defaultTTL = uint8(*v)
 		return nil
-	case fakeNetInvalidValueOption:
-		return tcpip.ErrInvalidOptionValue
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error {
+func (f *fakeNetworkProtocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case *fakeNetGoodOption:
-		*v = fakeNetGoodOption(f.opts.good)
+	case *tcpip.DefaultTTLOption:
+		*v = tcpip.DefaultTTLOption(f.defaultTTL)
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -1640,46 +1628,24 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) {
 	}
 }
 
-func TestNetworkOptions(t *testing.T) {
+func TestNetworkOption(t *testing.T) {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
 		TransportProtocols: []stack.TransportProtocol{},
 	})
 
-	// Try an unsupported network protocol.
-	if err := s.SetNetworkProtocolOption(tcpip.NetworkProtocolNumber(99999), fakeNetGoodOption(false)); err != tcpip.ErrUnknownProtocol {
-		t.Fatalf("SetNetworkProtocolOption(fakeNet2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err)
+	opt := tcpip.DefaultTTLOption(5)
+	if err := s.SetNetworkProtocolOption(fakeNetNumber, &opt); err != nil {
+		t.Fatalf("s.SetNetworkProtocolOption(%d, &%T(%d)): %s", fakeNetNumber, opt, opt, err)
 	}
 
-	testCases := []struct {
-		option   interface{}
-		wantErr  *tcpip.Error
-		verifier func(t *testing.T, p stack.NetworkProtocol)
-	}{
-		{fakeNetGoodOption(true), nil, func(t *testing.T, p stack.NetworkProtocol) {
-			t.Helper()
-			fakeNet := p.(*fakeNetworkProtocol)
-			if fakeNet.opts.good != true {
-				t.Fatalf("fakeNet.opts.good = false, want = true")
-			}
-			var v fakeNetGoodOption
-			if err := s.NetworkProtocolOption(fakeNetNumber, &v); err != nil {
-				t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) = %v, want = nil, where v is option %T", v, err)
-			}
-			if v != true {
-				t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) returned v = %v, want = true", v)
-			}
-		}},
-		{fakeNetBadOption(true), tcpip.ErrUnknownProtocolOption, nil},
-		{fakeNetInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil},
+	var optGot tcpip.DefaultTTLOption
+	if err := s.NetworkProtocolOption(fakeNetNumber, &optGot); err != nil {
+		t.Fatalf("s.NetworkProtocolOption(%d, &%T): %s", fakeNetNumber, optGot, err)
 	}
-	for _, tc := range testCases {
-		if got := s.SetNetworkProtocolOption(fakeNetNumber, tc.option); got != tc.wantErr {
-			t.Errorf("s.SetNetworkProtocolOption(fakeNet, %v) = %v, want = %v", tc.option, got, tc.wantErr)
-		}
-		if tc.verifier != nil {
-			tc.verifier(t, s.NetworkProtocolInstance(fakeNetNumber))
-		}
+
+	if opt != optGot {
+		t.Errorf("got optGot = %d, want = %d", optGot, opt)
 	}
 }
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index cd72d4f02..47a8d7c86 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -841,10 +841,26 @@ const (
 	PMTUDiscoveryProbe
 )
 
+// GettableNetworkProtocolOption is a marker interface for network protocol
+// options that may be queried.
+type GettableNetworkProtocolOption interface {
+	isGettableNetworkProtocolOption()
+}
+
+// SettableNetworkProtocolOption is a marker interface for network protocol
+// options that may be set.
+type SettableNetworkProtocolOption interface {
+	isSettableNetworkProtocolOption()
+}
+
 // DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
 // a default TTL.
 type DefaultTTLOption uint8
 
+func (*DefaultTTLOption) isGettableNetworkProtocolOption() {}
+
+func (*DefaultTTLOption) isSettableNetworkProtocolOption() {}
+
 // AvailableCongestionControlOption is used to query the supported congestion
 // control algorithms.
 type AvailableCongestionControlOption string
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index e8ea5093b..c3c754046 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -1066,8 +1066,13 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 	}
 
 	// Set default TTLs as required by socket/netstack.
-	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+	if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+		return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+	}
+	if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+		return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+	}
 
 	// Enable Receive Buffer Auto-Tuning.
 	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-- 
cgit v1.2.3


From 8d75fc4883ca8c10fb615203993d56d33a9e36b6 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Fri, 28 Aug 2020 14:29:16 -0700
Subject: Implement StatFS for various VFS2 filesystems.

This mainly involved enabling kernfs' client filesystems to provide a
StatFS implementation.

Fixes #3411, #3515.

PiperOrigin-RevId: 329009864
---
 pkg/abi/linux/fs.go                            |  1 +
 pkg/sentry/fsimpl/devpts/devpts.go             | 10 +++++++++-
 pkg/sentry/fsimpl/devpts/master.go             |  1 +
 pkg/sentry/fsimpl/devpts/slave.go              |  1 +
 pkg/sentry/fsimpl/fuse/fusefs.go               |  8 +++++++-
 pkg/sentry/fsimpl/host/host.go                 |  1 +
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |  1 +
 pkg/sentry/fsimpl/kernfs/filesystem.go         |  5 ++---
 pkg/sentry/fsimpl/kernfs/inode_impl_util.go    | 16 +++++++++++++---
 pkg/sentry/fsimpl/kernfs/kernfs.go             |  5 +++++
 pkg/sentry/fsimpl/kernfs/kernfs_test.go        |  8 +++++---
 pkg/sentry/fsimpl/kernfs/symlink.go            |  1 +
 pkg/sentry/fsimpl/pipefs/pipefs.go             |  8 +++++---
 pkg/sentry/fsimpl/proc/filesystem.go           |  7 +++++++
 pkg/sentry/fsimpl/proc/subtasks.go             |  9 +++++----
 pkg/sentry/fsimpl/proc/task.go                 |  7 ++++---
 pkg/sentry/fsimpl/proc/task_fds.go             | 19 +++++++++++--------
 pkg/sentry/fsimpl/proc/task_files.go           |  2 ++
 pkg/sentry/fsimpl/proc/tasks.go                |  9 +++++----
 pkg/sentry/fsimpl/proc/tasks_files.go          |  2 ++
 pkg/sentry/fsimpl/sockfs/sockfs.go             |  9 +++++++--
 pkg/sentry/fsimpl/sys/kcov.go                  |  3 ++-
 pkg/sentry/fsimpl/sys/sys.go                   | 14 ++++++++++++++
 pkg/sentry/vfs/filesystem_impl_util.go         | 13 +++++++++++++
 test/syscalls/BUILD                            |  1 +
 test/syscalls/linux/pipe.cc                    | 12 ++++++++++++
 test/syscalls/linux/proc.cc                    | 14 ++++++++++++++
 test/syscalls/linux/socket.cc                  | 17 +++++++++++++++++
 test/syscalls/linux/statfs.cc                  | 16 ++++++++++------
 29 files changed, 178 insertions(+), 42 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index 2b1ef0d4e..0d921ed6f 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -29,6 +29,7 @@ const (
 	SYSFS_MAGIC           = 0x62656572
 	TMPFS_MAGIC           = 0x01021994
 	V9FS_MAGIC            = 0x01021997
+	FUSE_SUPER_MAGIC      = 0x65735546
 )
 
 // Filesystem path limits, from uapi/linux/limits.h.
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 0eaff9087..57580f4d4 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -111,12 +111,13 @@ func (fs *filesystem) Release(ctx context.Context) {
 
 // rootInode is the root directory inode for the devpts mounts.
 type rootInode struct {
-	rootInodeRefs
+	implStatFS
 	kernfs.AlwaysValid
 	kernfs.InodeAttrs
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
+	rootInodeRefs
 
 	locks vfs.FileLocks
 
@@ -240,3 +241,10 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
 func (i *rootInode) DecRef(context.Context) {
 	i.rootInodeRefs.DecRef(i.Destroy)
 }
+
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 3bb397f71..60feb1993 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -30,6 +30,7 @@ import (
 
 // masterInode is the inode for the master end of the Terminal.
 type masterInode struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
index 32e4e1908..a9da7af64 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -29,6 +29,7 @@ import (
 
 // slaveInode is the inode for the slave end of the Terminal.
 type slaveInode struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 9717c0e15..810819ae4 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -200,9 +200,9 @@ func (fs *filesystem) Release(ctx context.Context) {
 type inode struct {
 	inodeRefs
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
@@ -331,3 +331,9 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio
 func (i *inode) DecRef(context.Context) {
 	i.inodeRefs.DecRef(i.Destroy)
 }
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	// TODO(gvisor.dev/issues/3413): Complete the implementation of statfs.
+	return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 2d3821f33..7561f821c 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -186,6 +186,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 
 // inode implements kernfs.Inode.
 type inode struct {
+	kernfs.InodeNoStatFS
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 12adf727a..1ee089620 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -35,6 +35,7 @@ import (
 // +stateify savable
 type DynamicBytesFile struct {
 	InodeAttrs
+	InodeNoStatFS
 	InodeNoopRefCount
 	InodeNotDirectory
 	InodeNotSymlink
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index e5d6b5c35..0e3011689 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -721,14 +721,13 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
 func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
 	fs.processDeferredDecRefs(ctx)
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO(gvisor.dev/issue/1193): actually implement statfs.
-	return linux.Statfs{}, syserror.ENOSYS
+	return inode.StatFS(ctx, fs.VFSFilesystem())
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index f442a5606..c0b863ba4 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -546,12 +546,13 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
 //
 // +stateify savable
 type StaticDirectory struct {
-	StaticDirectoryRefs
-	InodeNotSymlink
-	InodeDirectoryNoNewChildren
 	InodeAttrs
+	InodeDirectoryNoNewChildren
 	InodeNoDynamicLookup
+	InodeNoStatFS
+	InodeNotSymlink
 	OrderedChildren
+	StaticDirectoryRefs
 
 	locks  vfs.FileLocks
 	fdOpts GenericDirectoryFDOptions
@@ -609,3 +610,12 @@ type AlwaysValid struct{}
 func (*AlwaysValid) Valid(context.Context) bool {
 	return true
 }
+
+// InodeNoStatFS partially implements the Inode interface, where the client
+// filesystem doesn't support statfs(2).
+type InodeNoStatFS struct{}
+
+// StatFS implements Inode.StatFS.
+func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return linux.Statfs{}, syserror.ENOSYS
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index ca3685800..88fcd54aa 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -320,6 +320,11 @@ type Inode interface {
 	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
 	// the inode on which Open() is being called.
 	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+
+	// StatFS returns filesystem statistics for the client filesystem. This
+	// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
+	// doesn't support statfs(2), this should return ENOSYS.
+	StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)
 }
 
 type inodeRefs interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index e376d1736..675587c6b 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -98,9 +98,10 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S
 type readonlyDir struct {
 	readonlyDirRefs
 	attrs
-	kernfs.InodeNotSymlink
-	kernfs.InodeNoDynamicLookup
 	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNoStatFS
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
@@ -137,9 +138,10 @@ func (d *readonlyDir) DecRef(context.Context) {
 type dir struct {
 	dirRefs
 	attrs
-	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
+	kernfs.InodeNoStatFS
 
 	locks vfs.FileLocks
 
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 2ab3f53fd..64731a3e4 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -28,6 +28,7 @@ type StaticSymlink struct {
 	InodeAttrs
 	InodeNoopRefCount
 	InodeSymlink
+	InodeNoStatFS
 
 	target string
 }
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index 2ca793db9..7053ad6db 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -143,14 +143,16 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
 	return syserror.EPERM
 }
 
-// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement
-// statfs, from which we should indicate PIPEFS_MAGIC.
-
 // Open implements kernfs.Inode.Open.
 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
 }
 
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil
+}
+
 // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
 // and write ends of a newly-created pipe, as for pipe(2) and pipe2(2).
 //
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index c350ec127..03b5941b9 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -121,3 +121,10 @@ func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64
 type InternalData struct {
 	Cgroups map[string]string
 }
+
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 01c0efb3a..d57d94dbc 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -31,12 +31,13 @@ import (
 //
 // +stateify savable
 type subtasksInode struct {
-	subtasksInodeRefs
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.AlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
+	subtasksInodeRefs
 
 	locks vfs.FileLocks
 
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 66b557abd..dbdb5d929 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -32,12 +32,13 @@ import (
 //
 // +stateify savable
 type taskInode struct {
-	taskInodeRefs
-	kernfs.InodeNotSymlink
+	implStatFS
+	kernfs.InodeAttrs
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeNoDynamicLookup
-	kernfs.InodeAttrs
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
+	taskInodeRefs
 
 	locks vfs.FileLocks
 
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 0527b2de8..3f0d78461 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -100,13 +100,14 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off
 //
 // +stateify savable
 type fdDirInode struct {
+	fdDir
 	fdDirInodeRefs
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.AlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
-	fdDir
 }
 
 var _ kernfs.Inode = (*fdDirInode)(nil)
@@ -185,6 +186,7 @@ func (i *fdDirInode) DecRef(context.Context) {
 //
 // +stateify savable
 type fdSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -233,13 +235,14 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
 //
 // +stateify savable
 type fdInfoDirInode struct {
+	fdDir
 	fdInfoDirInodeRefs
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.AlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
-	fdDir
 }
 
 var _ kernfs.Inode = (*fdInfoDirInode)(nil)
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 830b78949..356036b9b 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -648,6 +648,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset
 //
 // +stateify savable
 type exeSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -832,6 +833,7 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir
 // namespaceInode is a synthetic inode created to represent a namespace in
 // /proc/[pid]/ns/*.
 type namespaceInode struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 863c4467e..3ea00ab87 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -37,12 +37,13 @@ const (
 //
 // +stateify savable
 type tasksInode struct {
-	tasksInodeRefs
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.AlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
+	tasksInodeRefs
 
 	locks vfs.FileLocks
 
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 7d8983aa5..8c41729e4 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -32,6 +32,7 @@ import (
 )
 
 type selfSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -74,6 +75,7 @@ func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials,
 }
 
 type threadSelfSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index c61818ff6..94a998568 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -81,10 +81,10 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 
 // inode implements kernfs.Inode.
 type inode struct {
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
 }
 
 // Open implements kernfs.Inode.Open.
@@ -92,6 +92,11 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 	return nil, syserror.ENXIO
 }
 
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil
+}
+
 // NewDentry constructs and returns a sockfs dentry.
 //
 // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
index 92710d877..73f3d3309 100644
--- a/pkg/sentry/fsimpl/sys/kcov.go
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -39,8 +39,9 @@ func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials)
 type kcovInode struct {
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
-	kernfs.InodeNotSymlink
 	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	implStatFS
 }
 
 func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index ea30a4ec2..39952d2d0 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -163,9 +163,16 @@ func (d *dir) DecRef(context.Context) {
 	d.dirRefs.DecRef(d.Destroy)
 }
 
+// StatFS implements kernfs.Inode.StatFS.
+func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
+
 // cpuFile implements kernfs.Inode.
 type cpuFile struct {
+	implStatFS
 	kernfs.DynamicBytesFile
+
 	maxCores uint
 }
 
@@ -182,3 +189,10 @@ func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode li
 	d.Init(c)
 	return d
 }
+
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 465e610e0..2620cf975 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,6 +16,9 @@ package vfs
 
 import (
 	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // GenericParseMountOptions parses a comma-separated list of options of the
@@ -41,3 +44,13 @@ func GenericParseMountOptions(str string) map[string]string {
 	}
 	return m
 }
+
+// GenericStatFS returns a statfs struct filled with the common fields for a
+// general filesystem. This is analogous to Linux's fs/libfs.cs:simple_statfs().
+func GenericStatFS(fsMagic uint64) linux.Statfs {
+	return linux.Statfs{
+		Type:       fsMagic,
+		BlockSize:  usermem.PageSize,
+		NameLength: linux.NAME_MAX,
+	}
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 65e8299c3..f949bc0e3 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -803,6 +803,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:statfs_test",
+    use_tmpfs = True,  # Test specifically relies on TEST_TMPDIR to be tmpfs.
 )
 
 syscall_test(
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index 34291850d..c097c9187 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <fcntl.h> /* Obtain O_* constant definitions */
+#include <linux/magic.h>
 #include <sys/ioctl.h>
+#include <sys/statfs.h>
 #include <sys/uio.h>
 #include <unistd.h>
 
@@ -198,6 +200,16 @@ TEST_P(PipeTest, NonBlocking) {
               SyscallFailsWithErrno(EWOULDBLOCK));
 }
 
+TEST(PipeTest, StatFS) {
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  struct statfs st;
+  EXPECT_THAT(fstatfs(fds[0], &st), SyscallSucceeds());
+  EXPECT_EQ(st.f_type, PIPEFS_MAGIC);
+  EXPECT_EQ(st.f_bsize, getpagesize());
+  EXPECT_EQ(st.f_namelen, NAME_MAX);
+}
+
 TEST(Pipe2Test, CloExec) {
   int fds[2];
   ASSERT_THAT(pipe2(fds, O_CLOEXEC), SyscallSucceeds());
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index d6b875dbf..b73189e55 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -16,6 +16,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <limits.h>
+#include <linux/magic.h>
 #include <sched.h>
 #include <signal.h>
 #include <stddef.h>
@@ -26,6 +27,7 @@
 #include <sys/mman.h>
 #include <sys/prctl.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/utsname.h>
 #include <syscall.h>
 #include <unistd.h>
@@ -2159,6 +2161,18 @@ TEST(Proc, PidTidIOAccounting) {
   noop.Join();
 }
 
+TEST(Proc, Statfs) {
+  struct statfs st;
+  EXPECT_THAT(statfs("/proc", &st), SyscallSucceeds());
+  if (IsRunningWithVFS1()) {
+    EXPECT_EQ(st.f_type, ANON_INODE_FS_MAGIC);
+  } else {
+    EXPECT_EQ(st.f_type, PROC_SUPER_MAGIC);
+  }
+  EXPECT_EQ(st.f_bsize, getpagesize());
+  EXPECT_EQ(st.f_namelen, NAME_MAX);
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc
index c20cd3fcc..e680d3dd7 100644
--- a/test/syscalls/linux/socket.cc
+++ b/test/syscalls/linux/socket.cc
@@ -14,6 +14,7 @@
 
 #include <sys/socket.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -26,6 +27,9 @@
 namespace gvisor {
 namespace testing {
 
+// From linux/magic.h, but we can't depend on linux headers here.
+#define SOCKFS_MAGIC 0x534F434B
+
 TEST(SocketTest, UnixSocketPairProtocol) {
   int socks[2];
   ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks),
@@ -94,6 +98,19 @@ TEST(SocketTest, UnixSocketStat) {
   }
 }
 
+TEST(SocketTest, UnixSocketStatFS) {
+  SKIP_IF(IsRunningWithVFS1());
+
+  FileDescriptor bound =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+
+  struct statfs st;
+  EXPECT_THAT(fstatfs(bound.get(), &st), SyscallSucceeds());
+  EXPECT_EQ(st.f_type, SOCKFS_MAGIC);
+  EXPECT_EQ(st.f_bsize, getpagesize());
+  EXPECT_EQ(st.f_namelen, NAME_MAX);
+}
+
 using SocketOpenTest = ::testing::TestWithParam<int>;
 
 // UDS cannot be opened.
diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc
index aca51d30f..49f2f156c 100644
--- a/test/syscalls/linux/statfs.cc
+++ b/test/syscalls/linux/statfs.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/magic.h>
 #include <sys/statfs.h>
 #include <unistd.h>
 
@@ -26,6 +27,10 @@ namespace testing {
 
 namespace {
 
+// From linux/magic.h. For some reason, not defined in the headers for some
+// build environments.
+#define OVERLAYFS_SUPER_MAGIC 0x794c7630
+
 TEST(StatfsTest, CannotStatBadPath) {
   auto temp_file = NewTempAbsPathInDir("/tmp");
 
@@ -38,19 +43,18 @@ TEST(StatfsTest, InternalTmpfs) {
 
   struct statfs st;
   EXPECT_THAT(statfs(temp_file.path().c_str(), &st), SyscallSucceeds());
+  // Note: We could be an overlay or goferfs on some configurations.
+  EXPECT_TRUE(st.f_type == TMPFS_MAGIC || st.f_type == OVERLAYFS_SUPER_MAGIC ||
+              st.f_type == V9FS_MAGIC);
 }
 
 TEST(StatfsTest, InternalDevShm) {
   struct statfs st;
   EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds());
-}
-
-TEST(StatfsTest, NameLen) {
-  struct statfs st;
-  EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds());
 
   // This assumes that /dev/shm is tmpfs.
-  EXPECT_EQ(st.f_namelen, NAME_MAX);
+  // Note: We could be an overlay on some configurations.
+  EXPECT_TRUE(st.f_type == TMPFS_MAGIC || st.f_type == OVERLAYFS_SUPER_MAGIC);
 }
 
 TEST(FstatfsTest, CannotStatBadFd) {
-- 
cgit v1.2.3


From aaae7109d23cc9a97aea27efcf6f541a594eddf4 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Fri, 28 Aug 2020 14:37:53 -0700
Subject: Don't bind loopback to all IPs in an IPv6 subnet

An earlier change considered the loopback bound to all addresses in an
assigned subnet. This should have only be done for IPv4 to maintain
compatability with Linux:

```
$ ip addr show dev lo
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group ...
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
$ ping 2001:db8::1
PING 2001:db8::1(2001:db8::1) 56 data bytes
^C
--- 2001:db8::1 ping statistics ---
4 packets transmitted, 0 received, 100% packet loss, time 3062ms

$ ping 2001:db8::2
PING 2001:db8::2(2001:db8::2) 56 data bytes
^C
--- 2001:db8::2 ping statistics ---
3 packets transmitted, 0 received, 100% packet loss, time 2030ms

$ sudo ip addr add 2001:db8::1/64 dev lo
$ ping 2001:db8::1
PING 2001:db8::1(2001:db8::1) 56 data bytes
64 bytes from 2001:db8::1: icmp_seq=1 ttl=64 time=0.055 ms
64 bytes from 2001:db8::1: icmp_seq=2 ttl=64 time=0.074 ms
64 bytes from 2001:db8::1: icmp_seq=3 ttl=64 time=0.073 ms
64 bytes from 2001:db8::1: icmp_seq=4 ttl=64 time=0.071 ms
^C
--- 2001:db8::1 ping statistics ---
4 packets transmitted, 4 received, 0% packet loss, time 3075ms
rtt min/avg/max/mdev = 0.055/0.068/0.074/0.007 ms
$ ping 2001:db8::2
PING 2001:db8::2(2001:db8::2) 56 data bytes
From 2001:db8::1 icmp_seq=1 Destination unreachable: No route
From 2001:db8::1 icmp_seq=2 Destination unreachable: No route
From 2001:db8::1 icmp_seq=3 Destination unreachable: No route
From 2001:db8::1 icmp_seq=4 Destination unreachable: No route
^C
--- 2001:db8::2 ping statistics ---
4 packets transmitted, 0 received, +4 errors, 100% packet loss, time 3070ms
```

Test: integration_test.TestLoopbackAcceptAllInSubnet
PiperOrigin-RevId: 329011566
---
 pkg/tcpip/stack/nic.go                             |  6 +--
 pkg/tcpip/tests/integration/loopback_test.go       | 40 ---------------
 test/syscalls/linux/BUILD                          | 19 +------
 .../linux/socket_ip_udp_unbound_netlink_util.cc    | 58 ----------------------
 .../linux/socket_ip_udp_unbound_netlink_util.h     | 34 -------------
 .../linux/socket_ipv4_udp_unbound_netlink.cc       | 32 +++++++++++-
 .../linux/socket_ipv4_udp_unbound_netlink.h        |  4 +-
 .../linux/socket_ipv6_udp_unbound_netlink.cc       | 28 ++++-------
 .../linux/socket_ipv6_udp_unbound_netlink.h        |  4 +-
 9 files changed, 49 insertions(+), 176 deletions(-)
 delete mode 100644 test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc
 delete mode 100644 test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 8e700990d..863ef6bee 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -676,10 +676,10 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	}
 
 	// A usable reference was not found, create a temporary one if requested by
-	// the caller or if the address is found in the NIC's subnets and the NIC is
-	// a loopback interface.
+	// the caller or if the IPv4 address is found in the NIC's subnets and the NIC
+	// is a loopback interface.
 	createTempEP := spoofingOrPromiscuous
-	if !createTempEP && n.isLoopback() {
+	if !createTempEP && n.isLoopback() && protocol == header.IPv4ProtocolNumber {
 		for _, r := range n.mu.endpoints {
 			addr := r.addrWithPrefix()
 			subnet := addr.Subnet()
diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go
index 3a2f75837..1b18023c5 100644
--- a/pkg/tcpip/tests/integration/loopback_test.go
+++ b/pkg/tcpip/tests/integration/loopback_test.go
@@ -109,52 +109,12 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) {
 			dstAddr:    ipv6Addr.Address,
 			expectRx:   true,
 		},
-		{
-			name:       "IPv6 bind to wildcard and send to assigned address",
-			addAddress: ipv6ProtocolAddress,
-			dstAddr:    ipv6Addr.Address,
-			expectRx:   true,
-		},
 		{
 			name:       "IPv6 bind to wildcard and send to other subnet-local address",
 			addAddress: ipv6ProtocolAddress,
 			dstAddr:    otherIPv6Address,
-			expectRx:   true,
-		},
-		{
-			name:       "IPv6 bind to wildcard send to other address",
-			addAddress: ipv6ProtocolAddress,
-			dstAddr:    remoteIPv6Addr,
-			expectRx:   false,
-		},
-		{
-			name:       "IPv6 bind to other subnet-local address and send to assigned address",
-			addAddress: ipv6ProtocolAddress,
-			bindAddr:   otherIPv6Address,
-			dstAddr:    ipv6Addr.Address,
-			expectRx:   false,
-		},
-		{
-			name:       "IPv6 bind and send to other subnet-local address",
-			addAddress: ipv6ProtocolAddress,
-			bindAddr:   otherIPv6Address,
-			dstAddr:    otherIPv6Address,
-			expectRx:   true,
-		},
-		{
-			name:       "IPv6 bind to assigned address and send to other subnet-local address",
-			addAddress: ipv6ProtocolAddress,
-			bindAddr:   ipv6Addr.Address,
-			dstAddr:    otherIPv6Address,
 			expectRx:   false,
 		},
-		{
-			name:       "IPv6 bind and send to assigned address",
-			addAddress: ipv6ProtocolAddress,
-			bindAddr:   ipv6Addr.Address,
-			dstAddr:    ipv6Addr.Address,
-			expectRx:   true,
-		},
 	}
 
 	for _, test := range tests {
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 5a323d331..fad3be7bf 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2416,21 +2416,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "socket_ip_udp_unbound_netlink_test_utils",
-    testonly = 1,
-    srcs = [
-        "socket_ip_udp_unbound_netlink_util.cc",
-    ],
-    hdrs = [
-        "socket_ip_udp_unbound_netlink_util.h",
-    ],
-    deps = [
-        ":socket_test_util",
-    ],
-    alwayslink = 1,
-)
-
 cc_library(
     name = "socket_ipv4_udp_unbound_netlink_test_cases",
     testonly = 1,
@@ -2441,8 +2426,8 @@ cc_library(
         "socket_ipv4_udp_unbound_netlink.h",
     ],
     deps = [
-        ":socket_ip_udp_unbound_netlink_test_utils",
         ":socket_netlink_route_util",
+        ":socket_test_util",
         "//test/util:capability_util",
         gtest,
     ],
@@ -2459,8 +2444,8 @@ cc_library(
         "socket_ipv6_udp_unbound_netlink.h",
     ],
     deps = [
-        ":socket_ip_udp_unbound_netlink_test_utils",
         ":socket_netlink_route_util",
+        ":socket_test_util",
         "//test/util:capability_util",
         gtest,
     ],
diff --git a/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc b/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc
deleted file mode 100644
index 13ffafde7..000000000
--- a/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h"
-
-namespace gvisor {
-namespace testing {
-
-const size_t kSendBufSize = 200;
-
-void IPUDPUnboundSocketNetlinkTest::TestSendRecv(TestAddress sender_addr,
-                                                 TestAddress receiver_addr) {
-  auto snd_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-  auto rcv_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-
-  EXPECT_THAT(
-      bind(snd_sock->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
-           sender_addr.addr_len),
-      SyscallSucceeds());
-
-  EXPECT_THAT(
-      bind(rcv_sock->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-           receiver_addr.addr_len),
-      SyscallSucceeds());
-  socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(rcv_sock->get(),
-                          reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                          &receiver_addr_len),
-              SyscallSucceeds());
-  EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
-  char send_buf[kSendBufSize];
-  RandomizeBuffer(send_buf, kSendBufSize);
-  EXPECT_THAT(
-      RetryEINTR(sendto)(snd_sock->get(), send_buf, kSendBufSize, 0,
-                         reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                         receiver_addr.addr_len),
-      SyscallSucceedsWithValue(kSendBufSize));
-
-  // Check that we received the packet.
-  char recv_buf[kSendBufSize] = {};
-  ASSERT_THAT(RetryEINTR(recv)(rcv_sock->get(), recv_buf, kSendBufSize, 0),
-              SyscallSucceedsWithValue(kSendBufSize));
-  EXPECT_EQ(0, memcmp(send_buf, recv_buf, kSendBufSize));
-}
-
-}  // namespace testing
-}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h b/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h
deleted file mode 100644
index 157fb0939..000000000
--- a/test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_NETLINK_UTIL_H_
-#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_NETLINK_UTIL_H_
-
-#include "test/syscalls/linux/socket_test_util.h"
-
-namespace gvisor {
-namespace testing {
-
-// Test fixture for tests that apply to IP UDP sockets.
-class IPUDPUnboundSocketNetlinkTest : public SimpleSocketTest {
- public:
-  // TestSendRecv tests sending and receiving a UDP packet from |sender_addr| to
-  // |receiver_addr|.
-  void TestSendRecv(TestAddress sender_addr, TestAddress receiver_addr);
-};
-
-}  // namespace testing
-}  // namespace gvisor
-
-#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_NETLINK_UTIL_H_
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
index 696fbb189..79eb48afa 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
@@ -23,6 +23,8 @@
 namespace gvisor {
 namespace testing {
 
+constexpr size_t kSendBufSize = 200;
+
 // Checks that the loopback interface considers itself bound to all IPs in an
 // associated subnet.
 TEST_P(IPv4UDPUnboundSocketNetlinkTest, JoinSubnet) {
@@ -35,6 +37,9 @@ TEST_P(IPv4UDPUnboundSocketNetlinkTest, JoinSubnet) {
   EXPECT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET,
                                    /*prefixlen=*/24, &addr, sizeof(addr)));
 
+  auto snd_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto rcv_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
   // Send from an unassigned address but an address that is in the subnet
   // associated with the loopback interface.
   TestAddress sender_addr("V4NotAssignd1");
@@ -43,6 +48,10 @@ TEST_P(IPv4UDPUnboundSocketNetlinkTest, JoinSubnet) {
   EXPECT_EQ(1, inet_pton(AF_INET, "192.0.2.2",
                          &(reinterpret_cast<sockaddr_in*>(&sender_addr.addr)
                                ->sin_addr.s_addr)));
+  EXPECT_THAT(
+      bind(snd_sock->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
 
   // Send the packet to an unassigned address but an address that is in the
   // subnet associated with the loopback interface.
@@ -52,8 +61,29 @@ TEST_P(IPv4UDPUnboundSocketNetlinkTest, JoinSubnet) {
   EXPECT_EQ(1, inet_pton(AF_INET, "192.0.2.254",
                          &(reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)
                                ->sin_addr.s_addr)));
+  EXPECT_THAT(
+      bind(rcv_sock->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t receiver_addr_len = receiver_addr.addr_len;
+  ASSERT_THAT(getsockname(rcv_sock->get(),
+                          reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+                          &receiver_addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+  char send_buf[kSendBufSize];
+  RandomizeBuffer(send_buf, kSendBufSize);
+  EXPECT_THAT(
+      RetryEINTR(sendto)(snd_sock->get(), send_buf, kSendBufSize, 0,
+                         reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+                         receiver_addr.addr_len),
+      SyscallSucceedsWithValue(kSendBufSize));
 
-  TestSendRecv(sender_addr, receiver_addr);
+  // Check that we received the packet.
+  char recv_buf[kSendBufSize] = {};
+  ASSERT_THAT(RetryEINTR(recv)(rcv_sock->get(), recv_buf, kSendBufSize, 0),
+              SyscallSucceedsWithValue(kSendBufSize));
+  EXPECT_EQ(0, memcmp(send_buf, recv_buf, kSendBufSize));
 }
 
 }  // namespace testing
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
index fcfb3318e..73e7836d5 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
@@ -15,13 +15,13 @@
 #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
 #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
 
-#include "test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
 
 namespace gvisor {
 namespace testing {
 
 // Test fixture for tests that apply to IPv4 UDP sockets.
-using IPv4UDPUnboundSocketNetlinkTest = IPUDPUnboundSocketNetlinkTest;
+using IPv4UDPUnboundSocketNetlinkTest = SimpleSocketTest;
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
index 539a4ec55..2ee218231 100644
--- a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
@@ -23,13 +23,10 @@
 namespace gvisor {
 namespace testing {
 
-// Checks that the loopback interface considers itself bound to all IPs in an
-// associated subnet.
+// Checks that the loopback interface does not consider itself bound to all IPs
+// in an associated subnet.
 TEST_P(IPv6UDPUnboundSocketNetlinkTest, JoinSubnet) {
-  // TODO(b/166440211): Only run this test on gvisor or remove if the loopback
-  // interface should not consider itself bound to all IPs in an IPv6 subnet.
-  SKIP_IF(!IsRunningOnGvisor() ||
-          !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
   // Add an IP address to the loopback interface.
   Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
@@ -38,25 +35,18 @@ TEST_P(IPv6UDPUnboundSocketNetlinkTest, JoinSubnet) {
   EXPECT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET6,
                                    /*prefixlen=*/64, &addr, sizeof(addr)));
 
-  // Send from an unassigned address but an address that is in the subnet
-  // associated with the loopback interface.
+  // Binding to an unassigned address but an address that is in the subnet
+  // associated with the loopback interface should fail.
   TestAddress sender_addr("V6NotAssignd1");
   sender_addr.addr.ss_family = AF_INET6;
   sender_addr.addr_len = sizeof(sockaddr_in6);
   EXPECT_EQ(1, inet_pton(AF_INET6, "2001:db8::2",
                          reinterpret_cast<sockaddr_in6*>(&sender_addr.addr)
                              ->sin6_addr.s6_addr));
-
-  // Send the packet to an unassigned address but an address that is in the
-  // subnet associated with the loopback interface.
-  TestAddress receiver_addr("V6NotAssigned2");
-  receiver_addr.addr.ss_family = AF_INET6;
-  receiver_addr.addr_len = sizeof(sockaddr_in6);
-  EXPECT_EQ(1, inet_pton(AF_INET6, "2001:db8::ffff:ffff:ffff:ffff",
-                         reinterpret_cast<sockaddr_in6*>(&receiver_addr.addr)
-                             ->sin6_addr.s6_addr));
-
-  TestSendRecv(sender_addr, receiver_addr);
+  auto sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  EXPECT_THAT(bind(sock->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+                   sender_addr.addr_len),
+              SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
 }  // namespace testing
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h
index 6a2b0a5be..88098be82 100644
--- a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h
@@ -15,13 +15,13 @@
 #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
 #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
 
-#include "test/syscalls/linux/socket_ip_udp_unbound_netlink_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
 
 namespace gvisor {
 namespace testing {
 
 // Test fixture for tests that apply to IPv6 UDP sockets.
-using IPv6UDPUnboundSocketNetlinkTest = IPUDPUnboundSocketNetlinkTest;
+using IPv6UDPUnboundSocketNetlinkTest = SimpleSocketTest;
 
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From fef6124b9dfa1e5c86e7b7b8c20f039d24291992 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Fri, 28 Aug 2020 17:18:43 -0700
Subject: Fix kernfs.Dentry reference leak.

PiperOrigin-RevId: 329036994
---
 pkg/sentry/socket/netstack/netstack_vfs2.go | 1 +
 1 file changed, 1 insertion(+)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index 59fa4c58f..1f7d17f5f 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -56,6 +56,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
 
 	mnt := t.Kernel().SocketMount()
 	d := sockfs.NewDentry(t.Credentials(), mnt)
+	defer d.DecRef(t)
 
 	s := &SocketVFS2{
 		socketOpsCommon: socketOpsCommon{
-- 
cgit v1.2.3


From 661c6bbb180129f2a81484005571233df6da16d2 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 31 Aug 2020 12:01:46 -0700
Subject: stateify: Bring back struct field and type names in pretty print

PiperOrigin-RevId: 329349158
---
 pkg/state/pretty/pretty.go | 103 +++++++++++++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 37 deletions(-)

(limited to 'pkg')

diff --git a/pkg/state/pretty/pretty.go b/pkg/state/pretty/pretty.go
index 1375fcc38..887f453a9 100644
--- a/pkg/state/pretty/pretty.go
+++ b/pkg/state/pretty/pretty.go
@@ -26,12 +26,17 @@ import (
 	"gvisor.dev/gvisor/pkg/state/wire"
 )
 
-func formatRef(x *wire.Ref, graph uint64, html bool) string {
+type printer struct {
+	html      bool
+	typeSpecs map[string]*wire.Type
+}
+
+func (p *printer) formatRef(x *wire.Ref, graph uint64) string {
 	baseRef := fmt.Sprintf("g%dr%d", graph, x.Root)
 	fullRef := baseRef
 	if len(x.Dots) > 0 {
 		// See wire.Ref; Type valid if Dots non-zero.
-		typ, _ := formatType(x.Type, graph, html)
+		typ, _ := p.formatType(x.Type, graph)
 		var buf strings.Builder
 		buf.WriteString("(*")
 		buf.WriteString(typ)
@@ -51,34 +56,40 @@ func formatRef(x *wire.Ref, graph uint64, html bool) string {
 		buf.WriteString(")")
 		fullRef = buf.String()
 	}
-	if html {
+	if p.html {
 		return fmt.Sprintf("<a href=\"#%s\">%s</a>", baseRef, fullRef)
 	}
 	return fullRef
 }
 
-func formatType(t wire.TypeSpec, graph uint64, html bool) (string, bool) {
+func (p *printer) formatType(t wire.TypeSpec, graph uint64) (string, bool) {
 	switch x := t.(type) {
 	case wire.TypeID:
-		base := fmt.Sprintf("g%dt%d", graph, x)
-		if html {
-			return fmt.Sprintf("<a href=\"#%s\">%s</a>", base, base), true
+		tag := fmt.Sprintf("g%dt%d", graph, x)
+		desc := tag
+		if spec, ok := p.typeSpecs[tag]; ok {
+			desc += fmt.Sprintf("=%s", spec.Name)
+		} else {
+			desc += "!missing-type-spec"
+		}
+		if p.html {
+			return fmt.Sprintf("<a href=\"#%s\">%s</a>", tag, desc), true
 		}
-		return fmt.Sprintf("%s", base), true
+		return desc, true
 	case wire.TypeSpecNil:
 		return "", false // Only nil type.
 	case *wire.TypeSpecPointer:
-		element, _ := formatType(x.Type, graph, html)
+		element, _ := p.formatType(x.Type, graph)
 		return fmt.Sprintf("(*%s)", element), true
 	case *wire.TypeSpecArray:
-		element, _ := formatType(x.Type, graph, html)
+		element, _ := p.formatType(x.Type, graph)
 		return fmt.Sprintf("[%d](%s)", x.Count, element), true
 	case *wire.TypeSpecSlice:
-		element, _ := formatType(x.Type, graph, html)
+		element, _ := p.formatType(x.Type, graph)
 		return fmt.Sprintf("([]%s)", element), true
 	case *wire.TypeSpecMap:
-		key, _ := formatType(x.Key, graph, html)
-		value, _ := formatType(x.Value, graph, html)
+		key, _ := p.formatType(x.Key, graph)
+		value, _ := p.formatType(x.Value, graph)
 		return fmt.Sprintf("(map[%s]%s)", key, value), true
 	default:
 		panic(fmt.Sprintf("unreachable: unknown type %T", t))
@@ -87,7 +98,7 @@ func formatType(t wire.TypeSpec, graph uint64, html bool) (string, bool) {
 
 // format formats a single object, for pretty-printing. It also returns whether
 // the value is a non-zero value.
-func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bool) {
+func (p *printer) format(graph uint64, depth int, encoded wire.Object) (string, bool) {
 	switch x := encoded.(type) {
 	case wire.Nil:
 		return "nil", false
@@ -98,7 +109,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 	case *wire.Complex128:
 		return fmt.Sprintf("%f+%fi", real(*x), imag(*x)), *x != 0.0
 	case *wire.Ref:
-		return formatRef(x, graph, html), x.Root != 0
+		return p.formatRef(x, graph), x.Root != 0
 	case *wire.Type:
 		tabs := "\n" + strings.Repeat("\t", depth)
 		items := make([]string, 0, len(x.Fields)+2)
@@ -109,7 +120,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "}")
 		return strings.Join(items, tabs), true // No zero value.
 	case *wire.Slice:
-		return fmt.Sprintf("%s{len:%d,cap:%d}", formatRef(&x.Ref, graph, html), x.Length, x.Capacity), x.Capacity != 0
+		return fmt.Sprintf("%s{len:%d,cap:%d}", p.formatRef(&x.Ref, graph), x.Length, x.Capacity), x.Capacity != 0
 	case *wire.Array:
 		if len(x.Contents) == 0 {
 			return "[]", false
@@ -119,7 +130,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "[")
 		tabs := "\n" + strings.Repeat("\t", depth)
 		for i := 0; i < len(x.Contents); i++ {
-			item, ok := format(graph, depth+1, x.Contents[i], html)
+			item, ok := p.format(graph, depth+1, x.Contents[i])
 			if !ok {
 				zeros = append(zeros, fmt.Sprintf("\t%s,", item))
 				continue
@@ -136,7 +147,9 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "]")
 		return strings.Join(items, tabs), len(zeros) < len(x.Contents)
 	case *wire.Struct:
-		typ, _ := formatType(x.TypeID, graph, html)
+		tag := fmt.Sprintf("g%dt%d", graph, x.TypeID)
+		spec, _ := p.typeSpecs[tag]
+		typ, _ := p.formatType(x.TypeID, graph)
 		if x.Fields() == 0 {
 			return fmt.Sprintf("struct[%s]{}", typ), false
 		}
@@ -145,9 +158,15 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		tabs := "\n" + strings.Repeat("\t", depth)
 		allZero := true
 		for i := 0; i < x.Fields(); i++ {
-			element, ok := format(graph, depth+1, *x.Field(i), html)
+			var name string
+			if spec != nil && i < len(spec.Fields) {
+				name = spec.Fields[i]
+			} else {
+				name = fmt.Sprintf("%d", i)
+			}
+			element, ok := p.format(graph, depth+1, *x.Field(i))
 			allZero = allZero && !ok
-			items = append(items, fmt.Sprintf("\t%d: %s,", i, element))
+			items = append(items, fmt.Sprintf("\t%s: %s,", name, element))
 		}
 		items = append(items, "}")
 		return strings.Join(items, tabs), !allZero
@@ -159,15 +178,15 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "map{")
 		tabs := "\n" + strings.Repeat("\t", depth)
 		for i := 0; i < len(x.Keys); i++ {
-			key, _ := format(graph, depth+1, x.Keys[i], html)
-			value, _ := format(graph, depth+1, x.Values[i], html)
+			key, _ := p.format(graph, depth+1, x.Keys[i])
+			value, _ := p.format(graph, depth+1, x.Values[i])
 			items = append(items, fmt.Sprintf("\t%s: %s,", key, value))
 		}
 		items = append(items, "}")
 		return strings.Join(items, tabs), true
 	case *wire.Interface:
-		typ, typOk := formatType(x.Type, graph, html)
-		element, elementOk := format(graph, depth+1, x.Value, html)
+		typ, typOk := p.formatType(x.Type, graph)
+		element, elementOk := p.format(graph, depth+1, x.Value)
 		return fmt.Sprintf("interface[%s]{%s}", typ, element), typOk || elementOk
 	default:
 		// Must be a primitive; use reflection.
@@ -176,11 +195,11 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 }
 
 // printStream is the basic print implementation.
-func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
+func (p *printer) printStream(w io.Writer, r wire.Reader) (err error) {
 	// current graph ID.
 	var graph uint64
 
-	if html {
+	if p.html {
 		fmt.Fprintf(w, "<pre>")
 		defer fmt.Fprintf(w, "</pre>")
 	}
@@ -195,6 +214,8 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 		}
 	}()
 
+	p.typeSpecs = make(map[string]*wire.Type)
+
 	for {
 		// Find the first object to begin generation.
 		length, object, err := state.ReadHeader(r)
@@ -222,18 +243,19 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 		// loop in decode.go. But we don't register type information,
 		// etc. and just print the raw structures.
 		var (
-			oid uint64 = 1
-			tid uint64 = 1
+			tid     uint64 = 1
+			objects []wire.Object
 		)
-		for oid <= length {
+		for oid := uint64(1); oid <= length; {
 			// Unmarshal the object.
 			encoded := wire.Load(r)
 
 			// Is this a type?
-			if _, ok := encoded.(*wire.Type); ok {
-				str, _ := format(graph, 0, encoded, html)
+			if typ, ok := encoded.(*wire.Type); ok {
+				str, _ := p.format(graph, 0, encoded)
 				tag := fmt.Sprintf("g%dt%d", graph, tid)
-				if html {
+				p.typeSpecs[tag] = typ
+				if p.html {
 					// See below.
 					tag = fmt.Sprintf("<a name=\"%s\">%s</a><a href=\"#%s\">&#9875;</a>", tag, tag, tag)
 				}
@@ -244,17 +266,24 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 				continue
 			}
 
+			// Otherwise, it is a node.
+			objects = append(objects, encoded)
+			oid++
+		}
+
+		for i, encoded := range objects {
+			// oid starts at 1.
+			oid := i + 1
 			// Format the node.
-			str, _ := format(graph, 0, encoded, html)
+			str, _ := p.format(graph, 0, encoded)
 			tag := fmt.Sprintf("g%dr%d", graph, oid)
-			if html {
+			if p.html {
 				// Create a little tag with an anchor next to it for linking.
 				tag = fmt.Sprintf("<a name=\"%s\">%s</a><a href=\"#%s\">&#9875;</a>", tag, tag, tag)
 			}
 			if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil {
 				return err
 			}
-			oid++
 		}
 	}
 
@@ -263,10 +292,10 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 
 // PrintText reads the stream from r and prints text to w.
 func PrintText(w io.Writer, r wire.Reader) error {
-	return printStream(w, r, false /* html */)
+	return (&printer{}).printStream(w, r)
 }
 
 // PrintHTML reads the stream from r and prints html to w.
 func PrintHTML(w io.Writer, r wire.Reader) error {
-	return printStream(w, r, true /* html */)
+	return (&printer{html: true}).printStream(w, r)
 }
-- 
cgit v1.2.3


From 1b879d8276c39dca6a43b656df9224e21b8b80e1 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 31 Aug 2020 12:50:31 -0700
Subject: Implement walk in gvisor verity fs

Implement walk directories in gvisor verity file system. For each step,
the child dentry is verified against a verified parent root hash.

PiperOrigin-RevId: 329358747
---
 pkg/merkletree/merkletree.go           |   6 +
 pkg/sentry/fsimpl/verity/BUILD         |   1 +
 pkg/sentry/fsimpl/verity/filesystem.go | 405 ++++++++++++++++++++++++++++++++-
 pkg/sentry/fsimpl/verity/verity.go     |  19 ++
 4 files changed, 425 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
index 1a0477c6a..36832ec86 100644
--- a/pkg/merkletree/merkletree.go
+++ b/pkg/merkletree/merkletree.go
@@ -29,6 +29,12 @@ const (
 	sha256DigestSize = 32
 )
 
+// DigestSize returns the size (in bytes) of a digest.
+// TODO(b/156980949): Allow config other hash methods (SHA384/SHA512).
+func DigestSize() int {
+	return sha256DigestSize
+}
+
 // Layout defines the scale of a Merkle tree.
 type Layout struct {
 	// blockSize is the size of a data block to be hashed.
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 28d2a4bcb..326c4ed90 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -13,6 +13,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/merkletree",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 78c6074bd..0e17dbddc 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -15,9 +15,15 @@
 package verity
 
 import (
+	"bytes"
+	"fmt"
+	"io"
+	"strconv"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/merkletree"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -91,10 +97,366 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
 	putDentrySlice(*ds)
 }
 
-// resolveLocked resolves rp to an existing file.
-func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
-	// TODO(b/159261227): Implement resolveLocked.
-	return nil, nil
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	child, err := fs.getChildLocked(ctx, d, name, ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// verifyChild verifies the root hash of child against the already verified
+// root hash of the parent to ensure the child is expected.  verifyChild
+// triggers a sentry panic if unexpected modifications to the file system are
+// detected. In noCrashOnVerificationFailure mode it returns a syserror
+// instead.
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// TODO(b/166474175): Investigate all possible errors returned in this
+// function, and make sure we differentiate all errors that indicate unexpected
+// modifications to the file system from the ones that are not harmful.
+func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	// Get the path to the child dentry. This is only used to provide path
+	// information in failure case.
+	childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	verityMu.RLock()
+	defer verityMu.RUnlock()
+	// Read the offset of the child from the extended attributes of the
+	// corresponding Merkle tree file.
+	// This is the offset of the root hash for child in its parent's Merkle
+	// tree file.
+	off, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  child.lowerMerkleVD,
+		Start: child.lowerMerkleVD,
+	}, &vfs.GetxattrOptions{
+		Name: merkleOffsetInParentXattr,
+		// Offset is a 32 bit integer.
+		Size: sizeOfInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the file or the xattr does not
+	// exist, it indicates unexpected modifications to the file system.
+	if err == syserror.ENOENT || err == syserror.ENODATA {
+		if noCrashOnVerificationFailure {
+			return nil, err
+		}
+		panic(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+	// The offset xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	offset, err := strconv.Atoi(off)
+	if err != nil {
+		if noCrashOnVerificationFailure {
+			return nil, syserror.EINVAL
+		}
+		panic(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
+	}
+
+	// Open parent Merkle tree file to read and verify child's root hash.
+	parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerMerkleVD,
+		Start: parent.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+
+	// The parent Merkle tree file should have been created. If it's
+	// missing, it indicates an unexpected modification to the file system.
+	if err == syserror.ENOENT {
+		if noCrashOnVerificationFailure {
+			return nil, err
+		}
+		panic(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// dataSize is the size of raw data for the Merkle tree. For a file,
+	// dataSize is the size of the whole file. For a directory, dataSize is
+	// the size of all its children's root hashes.
+	dataSize, err := parentMerkleFD.Getxattr(ctx, &vfs.GetxattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the file or the xattr does not
+	// exist, it indicates unexpected modifications to the file system.
+	if err == syserror.ENOENT || err == syserror.ENODATA {
+		if noCrashOnVerificationFailure {
+			return nil, err
+		}
+		panic(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// The dataSize xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	parentSize, err := strconv.Atoi(dataSize)
+	if err != nil {
+		if noCrashOnVerificationFailure {
+			return nil, syserror.EINVAL
+		}
+		panic(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+	}
+
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  parentMerkleFD,
+		Ctx: ctx,
+	}
+
+	// Since we are verifying against a directory Merkle tree, buf should
+	// contain the root hash of the children in the parent Merkle tree when
+	// Verify returns with success.
+	var buf bytes.Buffer
+	if err := merkletree.Verify(&buf, &fdReader, &fdReader, int64(parentSize), int64(offset), int64(merkletree.DigestSize()), parent.rootHash, true /* dataAndTreeInSameFile */); err != nil && err != io.EOF {
+		if noCrashOnVerificationFailure {
+			return nil, syserror.EIO
+		}
+		panic(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+	}
+
+	// Cache child root hash when it's verified the first time.
+	if len(child.rootHash) == 0 {
+		child.rootHash = buf.Bytes()
+	}
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	if child, ok := parent.children[name]; ok {
+		// If enabling verification on files/directories is not allowed
+		// during runtime, all cached children are already verified. If
+		// runtime enable is allowed and the parent directory is
+		// enabled, we should verify the child root hash here because
+		// it may be cached before enabled.
+		if fs.allowRuntimeEnable && len(parent.rootHash) != 0 {
+			if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+				return nil, err
+			}
+		}
+		return child, nil
+	}
+	child, err := fs.lookupAndVerifyLocked(ctx, parent, name)
+	if err != nil {
+		return nil, err
+	}
+	if parent.children == nil {
+		parent.children = make(map[string]*dentry)
+	}
+	parent.children[name] = child
+	// child's refcount is initially 0, so it may be dropped after traversal.
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	childFilename := fspath.Parse(name)
+	childVD, childErr := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerVD,
+		Start: parent.lowerVD,
+		Path:  childFilename,
+	}, &vfs.GetDentryOptions{})
+
+	// We will handle ENOENT separately, as it may indicate unexpected
+	// modifications to the file system, and may cause a sentry panic.
+	if childErr != nil && childErr != syserror.ENOENT {
+		return nil, childErr
+	}
+
+	// The dentry needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity child dentry is successfully created.
+	if childErr == nil {
+		defer childVD.DecRef(ctx)
+	}
+
+	childMerkleFilename := merklePrefix + name
+	childMerkleVD, childMerkleErr := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerVD,
+		Start: parent.lowerVD,
+		Path:  fspath.Parse(childMerkleFilename),
+	}, &vfs.GetDentryOptions{})
+
+	// We will handle ENOENT separately, as it may indicate unexpected
+	// modifications to the file system, and may cause a sentry panic.
+	if childMerkleErr != nil && childMerkleErr != syserror.ENOENT {
+		return nil, childMerkleErr
+	}
+
+	// The dentry needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity child dentry is successfully created.
+	if childMerkleErr == nil {
+		defer childMerkleVD.DecRef(ctx)
+	}
+
+	// Get the path to the parent dentry. This is only used to provide path
+	// information in failure case.
+	parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	// TODO(b/166474175): Investigate all possible errors of childErr and
+	// childMerkleErr, and make sure we differentiate all errors that
+	// indicate unexpected modifications to the file system from the ones
+	// that are not harmful.
+	if childErr == syserror.ENOENT && childMerkleErr == nil {
+		// Failed to get child file/directory dentry. However the
+		// corresponding Merkle tree is found. This indicates an
+		// unexpected modification to the file system that
+		// removed/renamed the child.
+		if noCrashOnVerificationFailure {
+			return nil, childErr
+		}
+		panic(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name))
+	} else if childErr == nil && childMerkleErr == syserror.ENOENT {
+		// If in allowRuntimeEnable mode, and the Merkle tree file is
+		// not created yet, we create an empty Merkle tree file, so that
+		// if the file is enabled through ioctl, we have the Merkle tree
+		// file open and ready to use.
+		// This may cause empty and unused Merkle tree files in
+		// allowRuntimeEnable mode, if they are never enabled. This
+		// does not affect verification, as we rely on cached root hash
+		// to decide whether to perform verification, not the existence
+		// of the Merkle tree file. Also, those Merkle tree files are
+		// always hidden and cannot be accessed by verity fs users.
+		if fs.allowRuntimeEnable {
+			childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+				Root:  parent.lowerVD,
+				Start: parent.lowerVD,
+				Path:  fspath.Parse(childMerkleFilename),
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT,
+			})
+			if err != nil {
+				return nil, err
+			}
+			childMerkleFD.DecRef(ctx)
+			childMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+				Root:  parent.lowerVD,
+				Start: parent.lowerVD,
+				Path:  fspath.Parse(childMerkleFilename),
+			}, &vfs.GetDentryOptions{})
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			// If runtime enable is not allowed. This indicates an
+			// unexpected modification to the file system that
+			// removed/renamed the Merkle tree file.
+			if noCrashOnVerificationFailure {
+				return nil, childMerkleErr
+			}
+			panic(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
+		}
+	}
+
+	mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+	stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  childVD,
+		Start: childVD,
+	}, &vfs.StatOptions{
+		Mask: mask,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	child := fs.newDentry()
+	child.lowerVD = childVD
+	child.lowerMerkleVD = childMerkleVD
+
+	// Increase the reference for both childVD and childMerkleVD as they are
+	// held by child. If this function fails and the child is destroyed, the
+	// references will be decreased in destroyLocked.
+	childVD.IncRef()
+	childMerkleVD.IncRef()
+
+	parent.IncRef()
+	child.parent = parent
+	child.name = name
+
+	// TODO(b/162788573): Verify child metadata.
+	child.mode = uint32(stat.Mode)
+	child.uid = stat.UID
+	child.gid = stat.GID
+
+	// Verify child root hash. This should always be performed unless in
+	// allowRuntimeEnable mode and the parent directory hasn't been enabled
+	// yet.
+	if !(fs.allowRuntimeEnable && len(parent.rootHash) == 0) {
+		if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+			child.destroyLocked(ctx)
+			return nil, err
+		}
+	}
+
+	return child, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -104,8 +466,39 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 //
 // Preconditions: fs.renameMu must be locked. !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
-	// TODO(b/159261227): Implement walkParentDirLocked.
-	return nil, nil
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
 }
 
 // AccessAt implements vfs.Filesystem.Impl.AccessAt.
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 1c5b07aa5..eedb5f484 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -41,6 +41,18 @@ const Name = "verity"
 // tree file for "/foo" is "/.merkle.verity.foo".
 const merklePrefix = ".merkle.verity."
 
+// merkleoffsetInParentXattr is the extended attribute name specifying the
+// offset of child root hash in its parent's Merkle tree.
+const merkleOffsetInParentXattr = "user.merkle.offset"
+
+// merkleSizeXattr is the extended attribute name specifying the size of data
+// hashed by the corresponding Merkle tree. For a file, it's the size of the
+// whole file. For a directory, it's the size of all its children's root hashes.
+const merkleSizeXattr = "user.merkle.size"
+
+// sizeOfInt32 is the size in bytes for a 32 bit integer in extended attributes.
+const sizeOfInt32 = 4
+
 // noCrashOnVerificationFailure indicates whether the sandbox should panic
 // whenever verification fails. If true, an error is returned instead of
 // panicking. This should only be set for tests.
@@ -48,6 +60,11 @@ const merklePrefix = ".merkle.verity."
 // flag.
 var noCrashOnVerificationFailure bool
 
+// verityMu synchronizes enabling verity files, protects files or directories
+// from being enabled by different threads simultaneously. It also ensures that
+// verity does not access files that are being enabled.
+var verityMu sync.RWMutex
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
@@ -215,6 +232,8 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	copy(d.rootHash, iopts.RootHash)
 	d.vfsd.Init(d)
 
+	fs.rootDentry = d
+
 	return &fs.vfsfs, &d.vfsd, nil
 }
 
-- 
cgit v1.2.3


From 47b496054e05c2dd33c0ecf1386a36b3edf7c6ef Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 31 Aug 2020 13:55:18 -0700
Subject: Don't use read-only host FD for writable gofer dentries in VFS2.

As documented for gofer.dentry.hostFD.

PiperOrigin-RevId: 329372319
---
 pkg/sentry/fsimpl/gofer/gofer.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 81d34cfe3..57bff1789 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1472,8 +1472,9 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 			return err
 		}
 
-		if d.hostFD < 0 && openReadable && h.fd >= 0 {
-			// We have no existing FD; use the new FD for at least reading.
+		if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) {
+			// We have no existing FD, and the new FD meets the requirements
+			// for d.hostFD, so start using it.
 			d.hostFD = h.fd
 		} else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
 			// We have an existing read-only FD, but the file has just been
-- 
cgit v1.2.3


From fcd85296f964c5965936039dfea7ed221d5bf68a Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Tue, 1 Sep 2020 09:52:52 -0700
Subject: Automated rollback of changelist 328350576

PiperOrigin-RevId: 329526153
---
 pkg/sentry/socket/netstack/netstack.go       |  45 +----
 pkg/sentry/socket/unix/transport/unix.go     |  10 +-
 pkg/tcpip/tcpip.go                           |  13 --
 pkg/tcpip/transport/tcp/endpoint.go          |  33 ----
 test/packetimpact/dut/posix_server.cc        |   9 -
 test/packetimpact/proto/posix_server.proto   |  11 --
 test/packetimpact/testbench/dut.go           |  42 -----
 test/packetimpact/tests/BUILD                |  10 --
 test/packetimpact/tests/tcp_linger_test.go   | 253 ---------------------------
 test/syscalls/linux/socket_inet_loopback.cc  |   3 +
 test/syscalls/linux/socket_ip_tcp_generic.cc | 119 -------------
 test/syscalls/linux/socket_ip_udp_generic.cc |  30 ----
 12 files changed, 8 insertions(+), 570 deletions(-)
 delete mode 100644 test/packetimpact/tests/tcp_linger_test.go

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 0bf21f7d8..36c17d1ba 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -482,35 +482,8 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
 }
 
 // Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release(ctx context.Context) {
-	e, ch := waiter.NewChannelEntry(nil)
-	s.EventRegister(&e, waiter.EventHUp|waiter.EventErr)
-	defer s.EventUnregister(&e)
-
+func (s *socketOpsCommon) Release(context.Context) {
 	s.Endpoint.Close()
-
-	// SO_LINGER option is valid only for TCP. For other socket types
-	// return after endpoint close.
-	if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) {
-		return
-	}
-
-	var v tcpip.LingerOption
-	if err := s.Endpoint.GetSockOpt(&v); err != nil {
-		return
-	}
-
-	// The case for zero timeout is handled in tcp endpoint close function.
-	// Close is blocked until either:
-	// 1. The endpoint state is not in any of the states: FIN-WAIT1,
-	// CLOSING and LAST_ACK.
-	// 2. Timeout is reached.
-	if v.Enabled && v.Timeout != 0 {
-		t := kernel.TaskFromContext(ctx)
-		start := t.Kernel().MonotonicClock().Now()
-		deadline := start.Add(v.Timeout)
-		t.BlockWithDeadline(ch, true, deadline)
-	}
 }
 
 // Read implements fs.FileOperations.Read.
@@ -1184,16 +1157,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.LingerOption
-		var linger linux.Linger
-		if err := ep.GetSockOpt(&v); err != nil {
-			return &linger, nil
-		}
-
-		if v.Enabled {
-			linger.OnOff = 1
-		}
-		linger.Linger = int32(v.Timeout.Seconds())
+		linger := linux.Linger{}
 		return &linger, nil
 
 	case linux.SO_SNDTIMEO:
@@ -1922,10 +1886,7 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
 
-		return syserr.TranslateNetstackError(
-			ep.SetSockOpt(&tcpip.LingerOption{
-				Enabled: v.OnOff != 0,
-				Timeout: time.Second * time.Duration(v.Linger)}))
+		return nil
 
 	case linux.SO_DETACH_FILTER:
 		// optval is ignored.
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index cc9d650fb..1200cf9bb 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -942,14 +942,8 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
-	switch opt.(type) {
-	case *tcpip.LingerOption:
-		return nil
-
-	default:
-		log.Warningf("Unsupported socket option: %T", opt)
-		return tcpip.ErrUnknownProtocolOption
-	}
+	log.Warningf("Unsupported socket option: %T", opt)
+	return tcpip.ErrUnknownProtocolOption
 }
 
 // LastError implements Endpoint.LastError.
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 47a8d7c86..b113d8613 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1074,19 +1074,6 @@ const (
 	TCPTimeWaitReuseLoopbackOnly
 )
 
-// LingerOption is used by SetSockOpt/GetSockOpt to set/get the
-// duration for which a socket lingers before returning from Close.
-//
-// +stateify savable
-type LingerOption struct {
-	Enabled bool
-	Timeout time.Duration
-}
-
-func (*LingerOption) isGettableSocketOption() {}
-
-func (*LingerOption) isSettableSocketOption() {}
-
 // IPPacketInfo is the message structure for IP_PKTINFO.
 //
 // +stateify savable
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index c5d9eba5d..3f18efeef 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -654,9 +654,6 @@ type endpoint struct {
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
-
-	// linger is used for SO_LINGER socket option.
-	linger tcpip.LingerOption
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -1010,26 +1007,6 @@ func (e *endpoint) Close() {
 		return
 	}
 
-	if e.linger.Enabled && e.linger.Timeout == 0 {
-		s := e.EndpointState()
-		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
-		if isResetState {
-			// Close the endpoint without doing full shutdown and
-			// send a RST.
-			e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-			e.closeNoShutdownLocked()
-
-			// Wake up worker to close the endpoint.
-			switch s {
-			case StateSynRecv:
-				e.notifyProtocolGoroutine(notifyClose)
-			default:
-				e.notifyProtocolGoroutine(notifyTickleWorker)
-			}
-			return
-		}
-	}
-
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
@@ -1830,11 +1807,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	case *tcpip.SocketDetachFilterOption:
 		return nil
 
-	case *tcpip.LingerOption:
-		e.LockUser()
-		e.linger = *v
-		e.UnlockUser()
-
 	default:
 		return nil
 	}
@@ -2057,11 +2029,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 			Port: port,
 		}
 
-	case *tcpip.LingerOption:
-		e.LockUser()
-		*o = e.linger
-		e.UnlockUser()
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index de5b4be93..2476998f8 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -336,15 +336,6 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Shutdown(grpc_impl::ServerContext *context,
-                          const ::posix_server::ShutdownRequest *request,
-                          ::posix_server::ShutdownResponse *response) override {
-    if (shutdown(request->fd(), request->how()) < 0) {
-      response->set_errno_(errno);
-    }
-    return ::grpc::Status::OK;
-  }
-
   ::grpc::Status Recv(::grpc::ServerContext *context,
                       const ::posix_server::RecvRequest *request,
                       ::posix_server::RecvResponse *response) override {
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index f32ed54ef..ccd20b10d 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -188,15 +188,6 @@ message SocketResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
-message ShutdownRequest {
-  int32 fd = 1;
-  int32 how = 2;
-}
-
-message ShutdownResponse {
-  int32 errno_ = 1;  // "errno" may fail to compile in c++.
-}
-
 message RecvRequest {
   int32 sockfd = 1;
   int32 len = 2;
@@ -234,8 +225,6 @@ service Posix {
   rpc SetSockOpt(SetSockOptRequest) returns (SetSockOptResponse);
   // Call socket() on the DUT.
   rpc Socket(SocketRequest) returns (SocketResponse);
-  // Call shutdown() on the DUT.
-  rpc Shutdown(ShutdownRequest) returns (ShutdownResponse);
   // Call recv() on the DUT.
   rpc Recv(RecvRequest) returns (RecvResponse);
 }
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index 6165ab293..73c532e75 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -16,13 +16,11 @@ package testbench
 
 import (
 	"context"
-	"encoding/binary"
 	"flag"
 	"net"
 	"strconv"
 	"syscall"
 	"testing"
-	"time"
 
 	pb "gvisor.dev/gvisor/test/packetimpact/proto/posix_server_go_proto"
 
@@ -702,43 +700,3 @@ func (dut *DUT) RecvWithErrno(ctx context.Context, t *testing.T, sockfd, len, fl
 	}
 	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
 }
-
-// SetSockLingerOption sets SO_LINGER socket option on the DUT.
-func (dut *DUT) SetSockLingerOption(t *testing.T, sockfd int32, timeout time.Duration, enable bool) {
-	var linger unix.Linger
-	if enable {
-		linger.Onoff = 1
-	}
-	linger.Linger = int32(timeout / time.Second)
-
-	buf := make([]byte, 8)
-	binary.LittleEndian.PutUint32(buf, uint32(linger.Onoff))
-	binary.LittleEndian.PutUint32(buf[4:], uint32(linger.Linger))
-	dut.SetSockOpt(t, sockfd, unix.SOL_SOCKET, unix.SO_LINGER, buf)
-}
-
-// Shutdown calls shutdown on the DUT and causes a fatal test failure if it doesn't
-// succeed. If more control over the timeout or error handling is needed, use
-// ShutdownWithErrno.
-func (dut *DUT) Shutdown(t *testing.T, fd, how int32) error {
-	t.Helper()
-
-	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
-	defer cancel()
-	return dut.ShutdownWithErrno(ctx, t, fd, how)
-}
-
-// ShutdownWithErrno calls shutdown on the DUT.
-func (dut *DUT) ShutdownWithErrno(ctx context.Context, t *testing.T, fd, how int32) error {
-	t.Helper()
-
-	req := pb.ShutdownRequest{
-		Fd:  fd,
-		How: how,
-	}
-	resp, err := dut.posixServer.Shutdown(ctx, &req)
-	if err != nil {
-		t.Fatalf("failed to call Shutdown: %s", err)
-	}
-	return syscall.Errno(resp.GetErrno_())
-}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 7a7152fa5..74658fea0 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -308,13 +308,3 @@ packetimpact_go_test(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
-
-packetimpact_go_test(
-    name = "tcp_linger",
-    srcs = ["tcp_linger_test.go"],
-    deps = [
-        "//pkg/tcpip/header",
-        "//test/packetimpact/testbench",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
-)
diff --git a/test/packetimpact/tests/tcp_linger_test.go b/test/packetimpact/tests/tcp_linger_test.go
deleted file mode 100644
index 913e49e06..000000000
--- a/test/packetimpact/tests/tcp_linger_test.go
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcp_linger_test
-
-import (
-	"context"
-	"flag"
-	"syscall"
-	"testing"
-	"time"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/test/packetimpact/testbench"
-)
-
-func init() {
-	testbench.RegisterFlags(flag.CommandLine)
-}
-
-func createSocket(t *testing.T, dut testbench.DUT) (int32, int32, testbench.TCPIPv4) {
-	listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
-	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
-	conn.Connect(t)
-	acceptFD, _ := dut.Accept(t, listenFD)
-	return acceptFD, listenFD, conn
-}
-
-func closeAll(t *testing.T, dut testbench.DUT, listenFD int32, conn testbench.TCPIPv4) {
-	conn.Close(t)
-	dut.Close(t, listenFD)
-	dut.TearDown()
-}
-
-// lingerDuration is the timeout value used with SO_LINGER socket option.
-const lingerDuration = 3 * time.Second
-
-// TestTCPLingerZeroTimeout tests when SO_LINGER is set with zero timeout. DUT
-// should send RST-ACK when socket is closed.
-func TestTCPLingerZeroTimeout(t *testing.T) {
-	// Create a socket, listen, TCP connect, and accept.
-	dut := testbench.NewDUT(t)
-	acceptFD, listenFD, conn := createSocket(t, dut)
-	defer closeAll(t, dut, listenFD, conn)
-
-	dut.SetSockLingerOption(t, acceptFD, 0, true)
-	dut.Close(t, acceptFD)
-
-	// If the linger timeout is set to zero, the DUT should send a RST.
-	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, time.Second); err != nil {
-		t.Errorf("expected RST-ACK packet within a second but got none: %s", err)
-	}
-	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-}
-
-// TestTCPLingerOff tests when SO_LINGER is not set. DUT should send FIN-ACK
-// when socket is closed.
-func TestTCPLingerOff(t *testing.T) {
-	// Create a socket, listen, TCP connect, and accept.
-	dut := testbench.NewDUT(t)
-	acceptFD, listenFD, conn := createSocket(t, dut)
-	defer closeAll(t, dut, listenFD, conn)
-
-	dut.Close(t, acceptFD)
-
-	// If SO_LINGER is not set, DUT should send a FIN-ACK.
-	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
-		t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
-	}
-	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-}
-
-// TestTCPLingerNonZeroTimeout tests when SO_LINGER is set with non-zero timeout.
-// DUT should close the socket after timeout.
-func TestTCPLingerNonZeroTimeout(t *testing.T) {
-	for _, tt := range []struct {
-		description string
-		lingerOn    bool
-	}{
-		{"WithNonZeroLinger", true},
-		{"WithoutLinger", false},
-	} {
-		t.Run(tt.description, func(t *testing.T) {
-			// Create a socket, listen, TCP connect, and accept.
-			dut := testbench.NewDUT(t)
-			acceptFD, listenFD, conn := createSocket(t, dut)
-			defer closeAll(t, dut, listenFD, conn)
-
-			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
-
-			// Increase timeout as Close will take longer time to
-			// return when SO_LINGER is set with non-zero timeout.
-			timeout := lingerDuration + 1*time.Second
-			ctx, cancel := context.WithTimeout(context.Background(), timeout)
-			defer cancel()
-			start := time.Now()
-			dut.CloseWithErrno(ctx, t, acceptFD)
-			end := time.Now()
-			diff := end.Sub(start)
-
-			if tt.lingerOn && diff < lingerDuration {
-				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
-			} else if !tt.lingerOn && diff > 1*time.Second {
-				t.Errorf("expected close to return within a second, but returned later")
-			}
-
-			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
-				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
-			}
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-		})
-	}
-}
-
-// TestTCPLingerSendNonZeroTimeout tests when SO_LINGER is set with non-zero
-// timeout and send a packet. DUT should close the socket after timeout.
-func TestTCPLingerSendNonZeroTimeout(t *testing.T) {
-	for _, tt := range []struct {
-		description string
-		lingerOn    bool
-	}{
-		{"WithSendNonZeroLinger", true},
-		{"WithoutLinger", false},
-	} {
-		t.Run(tt.description, func(t *testing.T) {
-			// Create a socket, listen, TCP connect, and accept.
-			dut := testbench.NewDUT(t)
-			acceptFD, listenFD, conn := createSocket(t, dut)
-			defer closeAll(t, dut, listenFD, conn)
-
-			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
-
-			// Send data.
-			sampleData := []byte("Sample Data")
-			dut.Send(t, acceptFD, sampleData, 0)
-
-			// Increase timeout as Close will take longer time to
-			// return when SO_LINGER is set with non-zero timeout.
-			timeout := lingerDuration + 1*time.Second
-			ctx, cancel := context.WithTimeout(context.Background(), timeout)
-			defer cancel()
-			start := time.Now()
-			dut.CloseWithErrno(ctx, t, acceptFD)
-			end := time.Now()
-			diff := end.Sub(start)
-
-			if tt.lingerOn && diff < lingerDuration {
-				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
-			} else if !tt.lingerOn && diff > 1*time.Second {
-				t.Errorf("expected close to return within a second, but returned later")
-			}
-
-			samplePayload := &testbench.Payload{Bytes: sampleData}
-			if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
-				t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
-			}
-
-			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
-				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
-			}
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-		})
-	}
-}
-
-// TestTCPLingerShutdownZeroTimeout tests SO_LINGER with shutdown() and zero
-// timeout. DUT should send RST-ACK when socket is closed.
-func TestTCPLingerShutdownZeroTimeout(t *testing.T) {
-	// Create a socket, listen, TCP connect, and accept.
-	dut := testbench.NewDUT(t)
-	acceptFD, listenFD, conn := createSocket(t, dut)
-	defer closeAll(t, dut, listenFD, conn)
-
-	dut.SetSockLingerOption(t, acceptFD, 0, true)
-	dut.Shutdown(t, acceptFD, syscall.SHUT_RDWR)
-	dut.Close(t, acceptFD)
-
-	// Shutdown will send FIN-ACK with read/write option.
-	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
-		t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
-	}
-
-	// If the linger timeout is set to zero, the DUT should send a RST.
-	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, time.Second); err != nil {
-		t.Errorf("expected RST-ACK packet within a second but got none: %s", err)
-	}
-	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-}
-
-// TestTCPLingerShutdownSendNonZeroTimeout tests SO_LINGER with shutdown() and
-// non-zero timeout. DUT should close the socket after timeout.
-func TestTCPLingerShutdownSendNonZeroTimeout(t *testing.T) {
-	for _, tt := range []struct {
-		description string
-		lingerOn    bool
-	}{
-		{"shutdownRDWR", true},
-		{"shutdownRDWR", false},
-	} {
-		t.Run(tt.description, func(t *testing.T) {
-			// Create a socket, listen, TCP connect, and accept.
-			dut := testbench.NewDUT(t)
-			acceptFD, listenFD, conn := createSocket(t, dut)
-			defer closeAll(t, dut, listenFD, conn)
-
-			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
-
-			// Send data.
-			sampleData := []byte("Sample Data")
-			dut.Send(t, acceptFD, sampleData, 0)
-
-			dut.Shutdown(t, acceptFD, syscall.SHUT_RDWR)
-
-			// Increase timeout as Close will take longer time to
-			// return when SO_LINGER is set with non-zero timeout.
-			timeout := lingerDuration + 1*time.Second
-			ctx, cancel := context.WithTimeout(context.Background(), timeout)
-			defer cancel()
-			start := time.Now()
-			dut.CloseWithErrno(ctx, t, acceptFD)
-			end := time.Now()
-			diff := end.Sub(start)
-
-			if tt.lingerOn && diff < lingerDuration {
-				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
-			} else if !tt.lingerOn && diff > 1*time.Second {
-				t.Errorf("expected close to return within a second, but returned later")
-			}
-
-			samplePayload := &testbench.Payload{Bytes: sampleData}
-			if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
-				t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
-			}
-
-			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
-				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
-			}
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-		})
-	}
-}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 425084228..ffcd90475 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1116,6 +1116,9 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
+  // TODO(gvisor.dev/issue/1400): Remove this after SO_LINGER is fixed.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Create the listening socket.
   const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index f4b69c46c..04356b780 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -1080,124 +1080,5 @@ TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
   }
 }
 
-// Test setsockopt and getsockopt for a socket with SO_LINGER option.
-TEST_P(TCPSocketPairTest, SetAndGetLingerOption) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  // Check getsockopt before SO_LINGER option is set.
-  struct linger got_linger = {-1, -1};
-  socklen_t got_len = sizeof(got_linger);
-
-  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
-                         &got_linger, &got_len),
-              SyscallSucceeds());
-  ASSERT_THAT(got_len, sizeof(got_linger));
-  struct linger want_linger = {};
-  EXPECT_EQ(0, memcmp(&want_linger, &got_linger, got_len));
-
-  // Set and get SO_LINGER with negative values.
-  struct linger sl;
-  sl.l_onoff = 1;
-  sl.l_linger = -3;
-  ASSERT_THAT(
-      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
-      SyscallSucceeds());
-  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
-                         &got_linger, &got_len),
-              SyscallSucceeds());
-  ASSERT_EQ(got_len, sizeof(got_linger));
-  EXPECT_EQ(sl.l_onoff, got_linger.l_onoff);
-  // Linux returns a different value as it uses HZ to convert the seconds to
-  // jiffies which overflows for negative values. We want to be compatible with
-  // linux for getsockopt return value.
-  if (IsRunningOnGvisor()) {
-    EXPECT_EQ(sl.l_linger, got_linger.l_linger);
-  }
-
-  // Set and get SO_LINGER option with positive values.
-  sl.l_onoff = 1;
-  sl.l_linger = 5;
-  ASSERT_THAT(
-      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
-      SyscallSucceeds());
-  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
-                         &got_linger, &got_len),
-              SyscallSucceeds());
-  ASSERT_EQ(got_len, sizeof(got_linger));
-  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
-}
-
-// Test socket to disable SO_LINGER option.
-TEST_P(TCPSocketPairTest, SetOffLingerOption) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  // Set the SO_LINGER option.
-  struct linger sl;
-  sl.l_onoff = 1;
-  sl.l_linger = 5;
-  ASSERT_THAT(
-      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
-      SyscallSucceeds());
-
-  // Check getsockopt after SO_LINGER option is set.
-  struct linger got_linger = {-1, -1};
-  socklen_t got_len = sizeof(got_linger);
-  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
-                         &got_linger, &got_len),
-              SyscallSucceeds());
-  ASSERT_EQ(got_len, sizeof(got_linger));
-  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
-
-  sl.l_onoff = 0;
-  sl.l_linger = 5;
-  ASSERT_THAT(
-      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
-      SyscallSucceeds());
-
-  // Check getsockopt after SO_LINGER option is set to zero.
-  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
-                         &got_linger, &got_len),
-              SyscallSucceeds());
-  ASSERT_EQ(got_len, sizeof(got_linger));
-  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
-}
-
-// Test close on dup'd socket with SO_LINGER option set.
-TEST_P(TCPSocketPairTest, CloseWithLingerOption) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  // Set the SO_LINGER option.
-  struct linger sl;
-  sl.l_onoff = 1;
-  sl.l_linger = 5;
-  ASSERT_THAT(
-      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
-      SyscallSucceeds());
-
-  // Check getsockopt after SO_LINGER option is set.
-  struct linger got_linger = {-1, -1};
-  socklen_t got_len = sizeof(got_linger);
-  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
-                         &got_linger, &got_len),
-              SyscallSucceeds());
-  ASSERT_EQ(got_len, sizeof(got_linger));
-  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
-
-  FileDescriptor dupFd = FileDescriptor(dup(sockets->first_fd()));
-  ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
-  char buf[10] = {};
-  // Write on dupFd should succeed as socket will not be closed until
-  // all references are removed.
-  ASSERT_THAT(RetryEINTR(write)(dupFd.get(), buf, sizeof(buf)),
-              SyscallSucceedsWithValue(sizeof(buf)));
-  ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
-              SyscallFailsWithErrno(EBADF));
-
-  // Close the socket.
-  dupFd.reset();
-  // Write on dupFd should fail as all references for socket are removed.
-  ASSERT_THAT(RetryEINTR(write)(dupFd.get(), buf, sizeof(buf)),
-              SyscallFailsWithErrno(EBADF));
-}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 6e4ecd680..bbe356116 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -450,35 +450,5 @@ TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
               SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
-// Test the SO_LINGER option can be set/get on udp socket.
-TEST_P(UDPSocketPairTest, SoLingerFail) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-  int level = SOL_SOCKET;
-  int type = SO_LINGER;
-
-  struct linger sl;
-  sl.l_onoff = 1;
-  sl.l_linger = 5;
-  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &sl, sizeof(sl)),
-              SyscallSucceedsWithValue(0));
-
-  struct linger got_linger = {};
-  socklen_t length = sizeof(sl);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), level, type, &got_linger, &length),
-      SyscallSucceedsWithValue(0));
-
-  ASSERT_EQ(length, sizeof(got_linger));
-  // Linux returns the values which are set in the SetSockOpt for SO_LINGER.
-  // In gVisor, we do not store the linger values for UDP as SO_LINGER for UDP
-  // is a no-op.
-  if (IsRunningOnGvisor()) {
-    struct linger want_linger = {};
-    EXPECT_EQ(0, memcmp(&want_linger, &got_linger, length));
-  } else {
-    EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
-  }
-}
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 114d1268b26e71c99fabb9d5b13d7c95c7c5c34b Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 1 Sep 2020 12:59:49 -0700
Subject: [go-marshal] Enable auto-marshalling for  fs/tty.

PiperOrigin-RevId: 329564614
---
 pkg/abi/linux/tty.go                        |  1 +
 pkg/sentry/fs/tty/BUILD                     |  1 +
 pkg/sentry/fs/tty/line_discipline.go        | 33 ++++++++++++-----------------
 pkg/sentry/fs/tty/master.go                 | 33 +++++++++++++++++------------
 pkg/sentry/fs/tty/queue.go                  | 12 +++++------
 pkg/sentry/fs/tty/slave.go                  | 33 +++++++++++++++++------------
 pkg/sentry/fs/tty/terminal.go               | 21 ++++++++----------
 pkg/sentry/fsimpl/devpts/BUILD              |  2 ++
 pkg/sentry/fsimpl/devpts/line_discipline.go | 33 ++++++++++++-----------------
 pkg/sentry/fsimpl/devpts/master.go          | 33 +++++++++++++++++------------
 pkg/sentry/fsimpl/devpts/queue.go           | 12 +++++------
 pkg/sentry/fsimpl/devpts/slave.go           | 33 +++++++++++++++++------------
 pkg/sentry/fsimpl/devpts/terminal.go        | 21 ++++++++----------
 13 files changed, 140 insertions(+), 128 deletions(-)

(limited to 'pkg')

diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go
index e640969a6..5a5ff0aa2 100644
--- a/pkg/abi/linux/tty.go
+++ b/pkg/abi/linux/tty.go
@@ -341,6 +341,7 @@ var DefaultSlaveTermios = KernelTermios{
 // include/uapi/asm-generic/termios.h.
 //
 // +stateify savable
+// +marshal
 type WindowSize struct {
 	Rows uint16
 	Cols uint16
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5cb0e0417..b3f5a8244 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -31,6 +31,7 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/primitive",
     ],
 )
 
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 2e9dd2d55..b6bc011a9 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -115,27 +116,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
 }
 
 // getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	// We must copy a Termios struct, not KernelTermios.
 	t := l.termios.ToTermios()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.Lock()
 	defer l.termiosMu.Unlock()
 	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
 	// We must copy a Termios struct, not KernelTermios.
 	var t linux.Termios
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyIn(task, args[2].Pointer())
 	l.termios.FromTermios(t)
 
 	// If canonical mode is turned off, move bytes from inQueue's wait
@@ -152,21 +149,17 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
 	return 0, err
 }
 
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyOut(t, args[2].Pointer())
 	return err
 }
 
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyIn(t, args[2].Pointer())
 	return err
 }
 
@@ -182,8 +175,8 @@ func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
 	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
 }
 
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(t, args)
 }
 
 func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -217,8 +210,8 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
 	return 0, syserror.ErrWouldBlock
 }
 
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(t, args)
 }
 
 func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index e00746017..175457297 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -20,10 +20,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // LINT.IfChange
@@ -152,46 +154,51 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm
 
 // Ioctl implements fs.FileOperations.Ioctl.
 func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the output queue read buffer.
-		return 0, mf.t.ld.outputQueueReadSize(ctx, io, args)
+		return 0, mf.t.ld.outputQueueReadSize(t, args)
 	case linux.TCGETS:
 		// N.B. TCGETS on the master actually returns the configuration
 		// of the slave end.
-		return mf.t.ld.getTermios(ctx, io, args)
+		return mf.t.ld.getTermios(t, args)
 	case linux.TCSETS:
 		// N.B. TCSETS on the master actually affects the configuration
 		// of the slave end.
-		return mf.t.ld.setTermios(ctx, io, args)
+		return mf.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return mf.t.ld.setTermios(ctx, io, args)
+		return mf.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(mf.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCSPTLCK:
 		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
 		return 0, nil
 	case linux.TIOCGWINSZ:
-		return 0, mf.t.ld.windowSize(ctx, io, args)
+		return 0, mf.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, mf.t.ld.setWindowSize(ctx, io, args)
+		return 0, mf.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mf.t.setControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mf.t.releaseControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mf.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index c5d7ec717..f0a18c75a 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -19,10 +19,12 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // LINT.IfChange
@@ -85,17 +87,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
 }
 
 // readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, args arch.SyscallArguments) error {
 	q.mu.Lock()
 	defer q.mu.Unlock()
-	var size int32
+	size := primitive.Int32(0)
 	if q.readable {
-		size = int32(len(q.readBuf))
+		size = primitive.Int32(len(q.readBuf))
 	}
 
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := size.CopyOut(t, args[2].Pointer())
 	return err
 
 }
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 7c7292687..933d2c3ff 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -20,9 +20,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // LINT.IfChange
@@ -136,39 +138,44 @@ func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src userme
 
 // Ioctl implements fs.FileOperations.Ioctl.
 func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the input queue read buffer.
-		return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args)
+		return 0, sf.si.t.ld.inputQueueReadSize(t, args)
 	case linux.TCGETS:
-		return sf.si.t.ld.getTermios(ctx, io, args)
+		return sf.si.t.ld.getTermios(t, args)
 	case linux.TCSETS:
-		return sf.si.t.ld.setTermios(ctx, io, args)
+		return sf.si.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return sf.si.t.ld.setTermios(ctx, io, args)
+		return sf.si.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(sf.si.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCGWINSZ:
-		return 0, sf.si.t.ld.windowSize(ctx, io, args)
+		return 0, sf.si.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, sf.si.t.ld.setWindowSize(ctx, io, args)
+		return 0, sf.si.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+		return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+		return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+		return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+		return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ddcccf4da..56b59632d 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -20,7 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // LINT.IfChange
@@ -64,7 +64,7 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
 
 // setControllingTTY makes tm the controlling terminal of the calling thread
 // group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
 
 // releaseControllingTTY removes tm as the controlling terminal of the calling
 // thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("releaseControllingTTY must be called from a task context")
@@ -85,7 +85,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
 }
 
 // foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("foregroundProcessGroup must be called from a task context")
@@ -97,24 +97,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
 	}
 
 	// Write it out to *arg.
-	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	retP := primitive.Int32(ret)
+	_, err = retP.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setForegroundProcessGroup must be called from a task context")
 	}
 
 	// Read in the process group ID.
-	var pgid int32
-	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
-		AddressSpaceActive: true,
-	}); err != nil {
+	var pgid primitive.Int32
+	if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
 		return 0, err
 	}
 
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 3f64fab3a..3e8c5e3fd 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -43,6 +43,8 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/primitive",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index f7bc325d1..b954c1ba1 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
 }
 
 // getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	// We must copy a Termios struct, not KernelTermios.
 	t := l.termios.ToTermios()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.Lock()
 	defer l.termiosMu.Unlock()
 	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
 	// We must copy a Termios struct, not KernelTermios.
 	var t linux.Termios
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyIn(task, args[2].Pointer())
 	l.termios.FromTermios(t)
 
 	// If canonical mode is turned off, move bytes from inQueue's wait
@@ -150,21 +147,17 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
 	return 0, err
 }
 
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyOut(t, args[2].Pointer())
 	return err
 }
 
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyIn(t, args[2].Pointer())
 	return err
 }
 
@@ -180,8 +173,8 @@ func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
 	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
 }
 
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(t, io, args)
 }
 
 func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -215,8 +208,8 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
 	return 0, syserror.ErrWouldBlock
 }
 
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(t, io, args)
 }
 
 func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 60feb1993..3422db6a4 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -20,12 +20,14 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // masterInode is the inode for the master end of the Terminal.
@@ -131,46 +133,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque
 
 // Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the output queue read buffer.
-		return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+		return 0, mfd.t.ld.outputQueueReadSize(t, io, args)
 	case linux.TCGETS:
 		// N.B. TCGETS on the master actually returns the configuration
 		// of the slave end.
-		return mfd.t.ld.getTermios(ctx, io, args)
+		return mfd.t.ld.getTermios(t, args)
 	case linux.TCSETS:
 		// N.B. TCSETS on the master actually affects the configuration
 		// of the slave end.
-		return mfd.t.ld.setTermios(ctx, io, args)
+		return mfd.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return mfd.t.ld.setTermios(ctx, io, args)
+		return mfd.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(mfd.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCSPTLCK:
 		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
 		return 0, nil
 	case linux.TIOCGWINSZ:
-		return 0, mfd.t.ld.windowSize(ctx, io, args)
+		return 0, mfd.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+		return 0, mfd.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index 331c13997..08eca2589 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -19,10 +19,12 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // waitBufMaxBytes is the maximum size of a wait buffer. It is based on
@@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
 }
 
 // readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
 	q.mu.Lock()
 	defer q.mu.Unlock()
-	var size int32
+	size := primitive.Int32(0)
 	if q.readable {
-		size = int32(len(q.readBuf))
+		size = primitive.Int32(len(q.readBuf))
 	}
 
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := size.CopyOut(t, args[2].Pointer())
 	return err
 
 }
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
index a9da7af64..5f4b474b3 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -20,11 +20,13 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // slaveInode is the inode for the slave end of the Terminal.
@@ -135,39 +137,44 @@ func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequen
 
 // Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the input queue read buffer.
-		return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
+		return 0, sfd.inode.t.ld.inputQueueReadSize(t, io, args)
 	case linux.TCGETS:
-		return sfd.inode.t.ld.getTermios(ctx, io, args)
+		return sfd.inode.t.ld.getTermios(t, args)
 	case linux.TCSETS:
-		return sfd.inode.t.ld.setTermios(ctx, io, args)
+		return sfd.inode.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return sfd.inode.t.ld.setTermios(ctx, io, args)
+		return sfd.inode.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(sfd.inode.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCGWINSZ:
-		return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
+		return 0, sfd.inode.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
+		return 0, sfd.inode.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+		return 0, sfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+		return 0, sfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+		return sfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+		return sfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index 7d2781c54..e88eb6360 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -19,7 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // Terminal is a pseudoterminal.
@@ -54,7 +54,7 @@ func newTerminal(n uint32) *Terminal {
 
 // setControllingTTY makes tm the controlling terminal of the calling thread
 // group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setControllingTTY must be called from a task context")
@@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
 
 // releaseControllingTTY removes tm as the controlling terminal of the calling
 // thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("releaseControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
 }
 
 // foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("foregroundProcessGroup must be called from a task context")
@@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
 	}
 
 	// Write it out to *arg.
-	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	retP := primitive.Int32(ret)
+	_, err = retP.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setForegroundProcessGroup must be called from a task context")
 	}
 
 	// Read in the process group ID.
-	var pgid int32
-	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
-		AddressSpaceActive: true,
-	}); err != nil {
+	var pgid primitive.Int32
+	if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
 		return 0, err
 	}
 
-- 
cgit v1.2.3


From 6b992edc8ab85070f50a6860b936b6cef82d48f8 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Tue, 1 Sep 2020 13:38:44 -0700
Subject: Fix panic when calling dup2().

PiperOrigin-RevId: 329572337
---
 pkg/sentry/kernel/fd_table.go            | 43 ++++++++++++++++----------------
 pkg/sentry/kernel/fd_table_test.go       |  8 +++---
 pkg/sentry/kernel/fd_table_unsafe.go     | 15 +++++------
 pkg/sentry/syscalls/linux/sys_file.go    | 12 ++++++---
 pkg/sentry/syscalls/linux/sys_pipe.go    |  2 +-
 pkg/sentry/syscalls/linux/sys_socket.go  |  2 +-
 pkg/sentry/syscalls/linux/vfs2/fd.go     |  4 +--
 pkg/sentry/syscalls/linux/vfs2/ioctl.go  |  4 +--
 pkg/sentry/syscalls/linux/vfs2/pipe.go   |  2 +-
 pkg/sentry/syscalls/linux/vfs2/socket.go |  2 +-
 10 files changed, 49 insertions(+), 45 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 5773244ac..89223fa36 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -112,7 +112,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 	ctx := context.Background()
 	f.init() // Initialize table.
 	for fd, d := range m {
-		f.setAll(fd, d.file, d.fileVFS2, d.flags)
+		f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags)
 
 		// Note that we do _not_ need to acquire a extra table reference here. The
 		// table reference will already be accounted for in the file, so we drop the
@@ -127,7 +127,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 }
 
 // drop drops the table reference.
-func (f *FDTable) drop(file *fs.File) {
+func (f *FDTable) drop(ctx context.Context, file *fs.File) {
 	// Release locks.
 	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
 
@@ -145,14 +145,13 @@ func (f *FDTable) drop(file *fs.File) {
 	d.InotifyEvent(ev, 0)
 
 	// Drop the table reference.
-	file.DecRef(context.Background())
+	file.DecRef(ctx)
 }
 
 // dropVFS2 drops the table reference.
-func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
 	// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
 	// entire file.
-	ctx := context.Background()
 	err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
 	if err != nil && err != syserror.ENOLCK {
 		panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
@@ -289,15 +288,15 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	// Install all entries.
 	for i := fd; i < end && len(fds) < len(files); i++ {
 		if d, _, _ := f.get(i); d == nil {
-			f.set(i, files[len(fds)], flags) // Set the descriptor.
-			fds = append(fds, i)             // Record the file descriptor.
+			f.set(ctx, i, files[len(fds)], flags) // Set the descriptor.
+			fds = append(fds, i)                  // Record the file descriptor.
 		}
 	}
 
 	// Failure? Unwind existing FDs.
 	if len(fds) < len(files) {
 		for _, i := range fds {
-			f.set(i, nil, FDFlags{}) // Zap entry.
+			f.set(ctx, i, nil, FDFlags{}) // Zap entry.
 		}
 		return nil, syscall.EMFILE
 	}
@@ -344,15 +343,15 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
 	// Install all entries.
 	for i := fd; i < end && len(fds) < len(files); i++ {
 		if d, _, _ := f.getVFS2(i); d == nil {
-			f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
-			fds = append(fds, i)                 // Record the file descriptor.
+			f.setVFS2(ctx, i, files[len(fds)], flags) // Set the descriptor.
+			fds = append(fds, i)                      // Record the file descriptor.
 		}
 	}
 
 	// Failure? Unwind existing FDs.
 	if len(fds) < len(files) {
 		for _, i := range fds {
-			f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+			f.setVFS2(ctx, i, nil, FDFlags{}) // Zap entry.
 		}
 		return nil, syscall.EMFILE
 	}
@@ -397,7 +396,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
 	}
 	for fd < end {
 		if d, _, _ := f.getVFS2(fd); d == nil {
-			f.setVFS2(fd, file, flags)
+			f.setVFS2(ctx, fd, file, flags)
 			if fd == f.next {
 				// Update next search start position.
 				f.next = fd + 1
@@ -439,14 +438,14 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 	// Install the entry.
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	f.setAll(fd, file, fileVFS2, flags)
+	f.setAll(ctx, fd, file, fileVFS2, flags)
 	return nil
 }
 
 // SetFlags sets the flags for the given file descriptor.
 //
 // True is returned iff flags were changed.
-func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
 	if fd < 0 {
 		// Don't accept negative FDs.
 		return syscall.EBADF
@@ -462,14 +461,14 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
 	}
 
 	// Update the flags.
-	f.set(fd, file, flags)
+	f.set(ctx, fd, file, flags)
 	return nil
 }
 
 // SetFlagsVFS2 sets the flags for the given file descriptor.
 //
 // True is returned iff flags were changed.
-func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
 	if fd < 0 {
 		// Don't accept negative FDs.
 		return syscall.EBADF
@@ -485,7 +484,7 @@ func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
 	}
 
 	// Update the flags.
-	f.setVFS2(fd, file, flags)
+	f.setVFS2(ctx, fd, file, flags)
 	return nil
 }
 
@@ -584,9 +583,9 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
 		// reference for the clone. We don't need anything else.
 		switch {
 		case file != nil:
-			clone.set(fd, file, flags)
+			clone.set(ctx, fd, file, flags)
 		case fileVFS2 != nil:
-			clone.setVFS2(fd, fileVFS2, flags)
+			clone.setVFS2(ctx, fd, fileVFS2, flags)
 		}
 	})
 	return clone
@@ -595,7 +594,7 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
 // Remove removes an FD from and returns a non-file iff successful.
 //
 // N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
+func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
 	if fd < 0 {
 		return nil, nil
 	}
@@ -618,7 +617,7 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
 		orig2.IncRef()
 	}
 	if orig != nil || orig2 != nil {
-		f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+		f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
 	}
 	return orig, orig2
 }
@@ -630,7 +629,7 @@ func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDes
 
 	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		if cond(file, fileVFS2, flags) {
-			f.set(fd, nil, FDFlags{}) // Clear from table.
+			f.set(ctx, fd, nil, FDFlags{}) // Clear from table.
 			// Update current available position.
 			if fd < f.next {
 				f.next = fd
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index e3f30ba2a..bf5460083 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) {
 		}
 
 		i := int32(2)
-		fdTable.Remove(i)
+		fdTable.Remove(ctx, i)
 		if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
 			t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
 		}
@@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) {
 			t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
 		} else {
 			for _, fd := range fds {
-				fdTable.Remove(fd)
+				fdTable.Remove(ctx, fd)
 			}
 		}
 
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
 			t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
 		}
 
-		ref, _ := fdTable.Remove(1)
+		ref, _ := fdTable.Remove(ctx, 1)
 		if ref == nil {
 			t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
 		}
 		ref.DecRef(ctx)
 
-		if ref, _ := fdTable.Remove(1); ref != nil {
+		if ref, _ := fdTable.Remove(ctx, 1); ref != nil {
 			t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
 		}
 	})
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 6b8feb107..555b14f8e 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
@@ -84,8 +85,8 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo
 // reference needed by the table iff the file is different.
 //
 // Precondition: mu must be held.
-func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
-	f.setAll(fd, file, nil, flags)
+func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) {
+	f.setAll(ctx, fd, file, nil, flags)
 }
 
 // setVFS2 sets an entry.
@@ -94,8 +95,8 @@ func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
 // reference needed by the table iff the file is different.
 //
 // Precondition: mu must be held.
-func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
-	f.setAll(fd, nil, file, flags)
+func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) {
+	f.setAll(ctx, fd, nil, file, flags)
 }
 
 // setAll sets an entry.
@@ -104,7 +105,7 @@ func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
 // reference needed by the table iff the file is different.
 //
 // Precondition: mu must be held.
-func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 	if file != nil && fileVFS2 != nil {
 		panic("VFS1 and VFS2 files set")
 	}
@@ -152,11 +153,11 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription,
 		switch {
 		case orig.file != nil:
 			if desc == nil || desc.file != orig.file {
-				f.drop(orig.file)
+				f.drop(ctx, orig.file)
 			}
 		case orig.fileVFS2 != nil:
 			if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
-				f.dropVFS2(orig.fileVFS2)
+				f.dropVFS2(ctx, orig.fileVFS2)
 			}
 		}
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 256422689..07c77e442 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -601,12 +601,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Shared flags between file and socket.
 	switch request {
 	case linux.FIONCLEX:
-		t.FDTable().SetFlags(fd, kernel.FDFlags{
+		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
 			CloseOnExec: false,
 		})
 		return 0, nil, nil
 	case linux.FIOCLEX:
-		t.FDTable().SetFlags(fd, kernel.FDFlags{
+		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
 			CloseOnExec: true,
 		})
 		return 0, nil, nil
@@ -787,7 +787,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Note that Remove provides a reference on the file that we may use to
 	// flush. It is still active until we drop the final reference below
 	// (and other reference-holding operations complete).
-	file, _ := t.FDTable().Remove(fd)
+	file, _ := t.FDTable().Remove(t, fd)
 	if file == nil {
 		return 0, nil, syserror.EBADF
 	}
@@ -941,7 +941,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(flags.ToLinuxFDFlags()), nil, nil
 	case linux.F_SETFD:
 		flags := args[2].Uint()
-		err := t.FDTable().SetFlags(fd, kernel.FDFlags{
+		err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
 		return 0, nil, err
@@ -1154,6 +1154,10 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/fd.go)
+
+// LINT.IfChange
+
 func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
 	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 3149e4aad..c55beb39b 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -48,7 +48,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 
 	if _, err := t.CopyOut(addr, fds); err != nil {
 		for _, fd := range fds {
-			if file, _ := t.FDTable().Remove(fd); file != nil {
+			if file, _ := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 38f573c14..e4528d095 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -249,7 +249,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	// Copy the file descriptors out.
 	if _, err := t.CopyOut(socks, fds); err != nil {
 		for _, fd := range fds {
-			if file, _ := t.FDTable().Remove(fd); file != nil {
+			if file, _ := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index 4856554fe..fdd8f88c5 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -34,7 +34,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Note that Remove provides a reference on the file that we may use to
 	// flush. It is still active until we drop the final reference below
 	// (and other reference-holding operations complete).
-	_, file := t.FDTable().Remove(fd)
+	_, file := t.FDTable().Remove(t, fd)
 	if file == nil {
 		return 0, nil, syserror.EBADF
 	}
@@ -137,7 +137,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(flags.ToLinuxFDFlags()), nil, nil
 	case linux.F_SETFD:
 		flags := args[2].Uint()
-		err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+		err := t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
 		return 0, nil, err
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 38778a388..baa8a49af 100644
--- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -34,13 +34,13 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Handle ioctls that apply to all FDs.
 	switch args[1].Int() {
 	case linux.FIONCLEX:
-		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+		t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
 			CloseOnExec: false,
 		})
 		return 0, nil, nil
 
 	case linux.FIOCLEX:
-		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+		t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
 			CloseOnExec: true,
 		})
 		return 0, nil, nil
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
index 9b4848d9e..3aa6d939d 100644
--- a/pkg/sentry/syscalls/linux/vfs2/pipe.go
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -53,7 +53,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
 	}
 	if _, err := t.CopyOut(addr, fds); err != nil {
 		for _, fd := range fds {
-			if _, file := t.FDTable().Remove(fd); file != nil {
+			if _, file := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index a5032657a..a15dad29f 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -252,7 +252,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	if _, err := t.CopyOut(addr, fds); err != nil {
 		for _, fd := range fds {
-			if _, file := t.FDTable().Remove(fd); file != nil {
+			if _, file := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
-- 
cgit v1.2.3


From 4332affa833c6a11326aa5db366419ba7445cdaf Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 1 Sep 2020 14:41:54 -0700
Subject: Refactor tty codebase to use master-replica terminology.

Updates #2972

PiperOrigin-RevId: 329584905
---
 pkg/abi/linux/dev.go                        |   6 +-
 pkg/abi/linux/tty.go                        |   6 +-
 pkg/sentry/fs/host/tty.go                   |   2 +-
 pkg/sentry/fs/tty/BUILD                     |   2 +-
 pkg/sentry/fs/tty/dir.go                    |  46 +++---
 pkg/sentry/fs/tty/fs.go                     |   4 +-
 pkg/sentry/fs/tty/line_discipline.go        |  22 +--
 pkg/sentry/fs/tty/master.go                 |   4 +-
 pkg/sentry/fs/tty/queue.go                  |   2 +-
 pkg/sentry/fs/tty/replica.go                | 186 +++++++++++++++++++++
 pkg/sentry/fs/tty/slave.go                  | 185 ---------------------
 pkg/sentry/fs/tty/terminal.go               |  18 +--
 pkg/sentry/fs/tty/tty_test.go               |   4 +-
 pkg/sentry/fsimpl/devpts/BUILD              |   2 +-
 pkg/sentry/fsimpl/devpts/devpts.go          |  34 ++--
 pkg/sentry/fsimpl/devpts/devpts_test.go     |   4 +-
 pkg/sentry/fsimpl/devpts/line_discipline.go |  22 +--
 pkg/sentry/fsimpl/devpts/master.go          |   4 +-
 pkg/sentry/fsimpl/devpts/queue.go           |   2 +-
 pkg/sentry/fsimpl/devpts/replica.go         | 205 +++++++++++++++++++++++
 pkg/sentry/fsimpl/devpts/slave.go           | 205 -----------------------
 pkg/sentry/fsimpl/devpts/terminal.go        |  16 +-
 pkg/sentry/fsimpl/host/host.go              |   2 +-
 runsc/boot/fs.go                            |  12 +-
 runsc/cmd/exec.go                           |   4 +-
 runsc/console/console.go                    |  18 +--
 runsc/container/console_test.go             |  10 +-
 runsc/container/multi_container_test.go     |   2 +-
 runsc/sandbox/sandbox.go                    |   4 +-
 test/syscalls/linux/pty.cc                  | 243 ++++++++++++++--------------
 test/syscalls/linux/pty_root.cc             |   4 +-
 test/util/pty_util.cc                       |  10 +-
 test/util/pty_util.h                        |   8 +-
 33 files changed, 651 insertions(+), 647 deletions(-)
 create mode 100644 pkg/sentry/fs/tty/replica.go
 delete mode 100644 pkg/sentry/fs/tty/slave.go
 create mode 100644 pkg/sentry/fsimpl/devpts/replica.go
 delete mode 100644 pkg/sentry/fsimpl/devpts/slave.go

(limited to 'pkg')

diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go
index 192e2093b..7771650b3 100644
--- a/pkg/abi/linux/dev.go
+++ b/pkg/abi/linux/dev.go
@@ -54,9 +54,9 @@ const (
 	// Unix98 PTY masters.
 	UNIX98_PTY_MASTER_MAJOR = 128
 
-	// UNIX98_PTY_SLAVE_MAJOR is the initial major device number for
-	// Unix98 PTY slaves.
-	UNIX98_PTY_SLAVE_MAJOR = 136
+	// UNIX98_PTY_REPLICA_MAJOR is the initial major device number for
+	// Unix98 PTY replicas.
+	UNIX98_PTY_REPLICA_MAJOR = 136
 )
 
 // Minor device numbers for TTYAUX_MAJOR.
diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go
index 5a5ff0aa2..47e65d9fb 100644
--- a/pkg/abi/linux/tty.go
+++ b/pkg/abi/linux/tty.go
@@ -325,9 +325,9 @@ var MasterTermios = KernelTermios{
 	OutputSpeed:       38400,
 }
 
-// DefaultSlaveTermios is the default terminal configuration of the slave end
-// of a Unix98 pseudoterminal.
-var DefaultSlaveTermios = KernelTermios{
+// DefaultReplicaTermios is the default terminal configuration of the replica
+// end of a Unix98 pseudoterminal.
+var DefaultReplicaTermios = KernelTermios{
 	InputFlags:        ICRNL | IXON,
 	OutputFlags:       OPOST | ONLCR,
 	ControlFlags:      B38400 | CS8 | CREAD,
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 67a807f9d..87d56a51d 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -54,7 +54,7 @@ type TTYFileOperations struct {
 func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
 	return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
 		fileOperations: fileOperations{iops: iops},
-		termios:        linux.DefaultSlaveTermios,
+		termios:        linux.DefaultReplicaTermios,
 	})
 }
 
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index b3f5a8244..fdd5a40d5 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -10,7 +10,7 @@ go_library(
         "line_discipline.go",
         "master.go",
         "queue.go",
-        "slave.go",
+        "replica.go",
         "terminal.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 463f6189e..c2da80bc2 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -37,14 +37,14 @@ import (
 // This indirectly manages all terminals within the mount.
 //
 // New Terminals are created by masterInodeOperations.GetFile, which registers
-// the slave Inode in the this directory for discovery via Lookup/Readdir. The
-// slave inode is unregistered when the master file is Released, as the slave
+// the replica Inode in the this directory for discovery via Lookup/Readdir. The
+// replica inode is unregistered when the master file is Released, as the replica
 // is no longer discoverable at that point.
 //
 // References on the underlying Terminal are held by masterFileOperations and
-// slaveInodeOperations.
+// replicaInodeOperations.
 //
-// masterInodeOperations and slaveInodeOperations hold a pointer to
+// masterInodeOperations and replicaInodeOperations hold a pointer to
 // dirInodeOperations, which is reference counted by the refcount their
 // corresponding Dirents hold on their parent (this directory).
 //
@@ -76,16 +76,16 @@ type dirInodeOperations struct {
 	// master is the master PTY inode.
 	master *fs.Inode
 
-	// slaves contains the slave inodes reachable from the directory.
+	// replicas contains the replica inodes reachable from the directory.
 	//
-	// A new slave is added by allocateTerminal and is removed by
+	// A new replica is added by allocateTerminal and is removed by
 	// masterFileOperations.Release.
 	//
-	// A reference is held on every slave in the map.
-	slaves map[uint32]*fs.Inode
+	// A reference is held on every replica in the map.
+	replicas map[uint32]*fs.Inode
 
 	// dentryMap is a SortedDentryMap used to implement Readdir containing
-	// the master and all entries in slaves.
+	// the master and all entries in replicas.
 	dentryMap *fs.SortedDentryMap
 
 	// next is the next pty index to use.
@@ -101,7 +101,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
 	d := &dirInodeOperations{
 		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC),
 		msrc:                  m,
-		slaves:                make(map[uint32]*fs.Inode),
+		replicas:              make(map[uint32]*fs.Inode),
 		dentryMap:             fs.NewSortedDentryMap(nil),
 	}
 	// Linux devpts uses a default mode of 0000 for ptmx which can be
@@ -133,7 +133,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) {
 	defer d.mu.Unlock()
 
 	d.master.DecRef(ctx)
-	if len(d.slaves) != 0 {
+	if len(d.replicas) != 0 {
 		panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
 	}
 }
@@ -149,14 +149,14 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str
 		return fs.NewDirent(ctx, d.master, name), nil
 	}
 
-	// Slave number?
+	// Replica number?
 	n, err := strconv.ParseUint(name, 10, 32)
 	if err != nil {
 		// Not found.
 		return nil, syserror.ENOENT
 	}
 
-	s, ok := d.slaves[uint32(n)]
+	s, ok := d.replicas[uint32(n)]
 	if !ok {
 		return nil, syserror.ENOENT
 	}
@@ -236,7 +236,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
 		return nil, syserror.ENOMEM
 	}
 
-	if _, ok := d.slaves[n]; ok {
+	if _, ok := d.replicas[n]; ok {
 		panic(fmt.Sprintf("pty index collision; index %d already exists", n))
 	}
 
@@ -244,19 +244,19 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
 	d.next++
 
 	// The reference returned by newTerminal is returned to the caller.
-	// Take another for the slave inode.
+	// Take another for the replica inode.
 	t.IncRef()
 
 	// Create a pts node. The owner is based on the context that opens
 	// ptmx.
 	creds := auth.CredentialsFromContext(ctx)
 	uid, gid := creds.EffectiveKUID, creds.EffectiveKGID
-	slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
+	replica := newReplicaInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
 
-	d.slaves[n] = slave
+	d.replicas[n] = replica
 	d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{
-		Type:    slave.StableAttr.Type,
-		InodeID: slave.StableAttr.InodeID,
+		Type:    replica.StableAttr.Type,
+		InodeID: replica.StableAttr.InodeID,
 	})
 
 	return t, nil
@@ -267,18 +267,18 @@ func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) {
 	d.mu.Lock()
 	defer d.mu.Unlock()
 
-	// The slave end disappears from the directory when the master end is
-	// closed, even if the slave end is open elsewhere.
+	// The replica end disappears from the directory when the master end is
+	// closed, even if the replica end is open elsewhere.
 	//
 	// N.B. since we're using a backdoor method to remove a directory entry
 	// we won't properly fire inotify events like Linux would.
-	s, ok := d.slaves[t.n]
+	s, ok := d.replicas[t.n]
 	if !ok {
 		panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d))
 	}
 
 	s.DecRef(ctx)
-	delete(d.slaves, t.n)
+	delete(d.replicas, t.n)
 	d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10))
 }
 
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
index 2d4d44bf3..13f4901db 100644
--- a/pkg/sentry/fs/tty/fs.go
+++ b/pkg/sentry/fs/tty/fs.go
@@ -79,8 +79,8 @@ type superOperations struct{}
 //
 // It always returns true, forcing a Lookup for all entries.
 //
-// Slave entries are dropped from dir when their master is closed, so an
-// existing slave Dirent in the tree is not sufficient to guarantee that it
+// Replica entries are dropped from dir when their master is closed, so an
+// existing replica Dirent in the tree is not sufficient to guarantee that it
 // still exists on the filesystem.
 func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool {
 	return true
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index b6bc011a9..b34f4a0eb 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -44,7 +44,7 @@ const (
 )
 
 // lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
 // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
 // pages are good resources for how to affect the line discipline:
 //
@@ -55,8 +55,8 @@ const (
 //
 // lineDiscipline has a simple structure but supports a multitude of options
 // (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
 // discipline reads the bytes, modifies them or takes special action if
 // required, and enqueues them to be read by the other end of the pty:
 //
@@ -65,7 +65,7 @@ const (
 //    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
 //    |                                                                   |
 //    |                                                                   v
-// masterFD                                                            slaveFD
+// masterFD                                                           replicaFD
 //    ^                                                                   |
 //    |                                                                   |
 //    |   output to terminal   +--------------+    output from process    |
@@ -104,8 +104,8 @@ type lineDiscipline struct {
 	// masterWaiter is used to wait on the master end of the TTY.
 	masterWaiter waiter.Queue `state:"zerovalue"`
 
-	// slaveWaiter is used to wait on the slave end of the TTY.
-	slaveWaiter waiter.Queue `state:"zerovalue"`
+	// replicaWaiter is used to wait on the replica end of the TTY.
+	replicaWaiter waiter.Queue `state:"zerovalue"`
 }
 
 func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -143,7 +143,7 @@ func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArgument
 		l.inQueue.pushWaitBufLocked(l)
 		l.inQueue.readable = true
 		l.inQueue.mu.Unlock()
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 	}
 
 	return 0, err
@@ -169,7 +169,7 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
 	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
 }
 
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
@@ -189,7 +189,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
 	if n > 0 {
 		l.masterWaiter.Notify(waiter.EventOut)
 		if pushed {
-			l.slaveWaiter.Notify(waiter.EventIn)
+			l.replicaWaiter.Notify(waiter.EventIn)
 		}
 		return n, nil
 	}
@@ -204,7 +204,7 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 		return n, nil
 	}
 	return 0, syserror.ErrWouldBlock
@@ -222,7 +222,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventOut)
+		l.replicaWaiter.Notify(waiter.EventOut)
 		if pushed {
 			l.masterWaiter.Notify(waiter.EventIn)
 		}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 175457297..bebf90ffa 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -166,11 +166,11 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io userme
 		return 0, mf.t.ld.outputQueueReadSize(t, args)
 	case linux.TCGETS:
 		// N.B. TCGETS on the master actually returns the configuration
-		// of the slave end.
+		// of the replica end.
 		return mf.t.ld.getTermios(t, args)
 	case linux.TCSETS:
 		// N.B. TCSETS on the master actually affects the configuration
-		// of the slave end.
+		// of the replica end.
 		return mf.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index f0a18c75a..e070a1b71 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -34,7 +34,7 @@ import (
 const waitBufMaxBytes = 131072
 
 // queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
 // full, at which point they are written to the wait buffer. Bytes are
 // processed (i.e. undergo termios transformations) as they are added to the
 // read buffer. The read buffer is readable when its length is nonzero and
diff --git a/pkg/sentry/fs/tty/replica.go b/pkg/sentry/fs/tty/replica.go
new file mode 100644
index 000000000..cb6cd6864
--- /dev/null
+++ b/pkg/sentry/fs/tty/replica.go
@@ -0,0 +1,186 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tty
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
+)
+
+// LINT.IfChange
+
+// replicaInodeOperations are the fs.InodeOperations for the replica end of the
+// Terminal (pts file).
+//
+// +stateify savable
+type replicaInodeOperations struct {
+	fsutil.SimpleFileInode
+
+	// d is the containing dir.
+	d *dirInodeOperations
+
+	// t is the connected Terminal.
+	t *Terminal
+}
+
+var _ fs.InodeOperations = (*replicaInodeOperations)(nil)
+
+// newReplicaInode creates an fs.Inode for the replica end of a terminal.
+//
+// newReplicaInode takes ownership of t.
+func newReplicaInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
+	iops := &replicaInodeOperations{
+		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC),
+		d:               d,
+		t:               t,
+	}
+
+	return fs.NewInode(ctx, iops, d.msrc, fs.StableAttr{
+		DeviceID: ptsDevice.DeviceID(),
+		// N.B. Linux always uses inode id = tty index + 3. See
+		// fs/devpts/inode.c:devpts_pty_new.
+		//
+		// TODO(b/75267214): Since ptsDevice must be shared between
+		// different mounts, we must not assign fixed numbers.
+		InodeID: ptsDevice.NextIno(),
+		Type:    fs.CharacterDevice,
+		// See fs/devpts/inode.c:devpts_fill_super.
+		BlockSize:       1024,
+		DeviceFileMajor: linux.UNIX98_PTY_REPLICA_MAJOR,
+		DeviceFileMinor: t.n,
+	})
+}
+
+// Release implements fs.InodeOperations.Release.
+func (si *replicaInodeOperations) Release(ctx context.Context) {
+	si.t.DecRef(ctx)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (*replicaInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+//
+// This may race with destruction of the terminal. If the terminal is gone, it
+// returns ENOENT.
+func (si *replicaInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, d, flags, &replicaFileOperations{si: si}), nil
+}
+
+// replicaFileOperations are the fs.FileOperations for the replica end of a
+// terminal.
+//
+// +stateify savable
+type replicaFileOperations struct {
+	fsutil.FilePipeSeek             `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileNoFsync              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	// si is the inode operations.
+	si *replicaInodeOperations
+}
+
+var _ fs.FileOperations = (*replicaFileOperations)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (sf *replicaFileOperations) Release(context.Context) {
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (sf *replicaFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	sf.si.t.ld.replicaWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (sf *replicaFileOperations) EventUnregister(e *waiter.Entry) {
+	sf.si.t.ld.replicaWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (sf *replicaFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return sf.si.t.ld.replicaReadiness()
+}
+
+// Read implements fs.FileOperations.Read.
+func (sf *replicaFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+	return sf.si.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements fs.FileOperations.Write.
+func (sf *replicaFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+	return sf.si.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (sf *replicaFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the input queue read buffer.
+		return 0, sf.si.t.ld.inputQueueReadSize(t, args)
+	case linux.TCGETS:
+		return sf.si.t.ld.getTermios(t, args)
+	case linux.TCSETS:
+		return sf.si.t.ld.setTermios(t, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return sf.si.t.ld.setTermios(t, args)
+	case linux.TIOCGPTN:
+		nP := primitive.Uint32(sf.si.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
+		return 0, err
+	case linux.TIOCGWINSZ:
+		return 0, sf.si.t.ld.windowSize(t, args)
+	case linux.TIOCSWINSZ:
+		return 0, sf.si.t.ld.setWindowSize(t, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// LINT.ThenChange(../../fsimpl/devpts/replica.go)
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
deleted file mode 100644
index 933d2c3ff..000000000
--- a/pkg/sentry/fs/tty/slave.go
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tty
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
-)
-
-// LINT.IfChange
-
-// slaveInodeOperations are the fs.InodeOperations for the slave end of the
-// Terminal (pts file).
-//
-// +stateify savable
-type slaveInodeOperations struct {
-	fsutil.SimpleFileInode
-
-	// d is the containing dir.
-	d *dirInodeOperations
-
-	// t is the connected Terminal.
-	t *Terminal
-}
-
-var _ fs.InodeOperations = (*slaveInodeOperations)(nil)
-
-// newSlaveInode creates an fs.Inode for the slave end of a terminal.
-//
-// newSlaveInode takes ownership of t.
-func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
-	iops := &slaveInodeOperations{
-		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC),
-		d:               d,
-		t:               t,
-	}
-
-	return fs.NewInode(ctx, iops, d.msrc, fs.StableAttr{
-		DeviceID: ptsDevice.DeviceID(),
-		// N.B. Linux always uses inode id = tty index + 3. See
-		// fs/devpts/inode.c:devpts_pty_new.
-		//
-		// TODO(b/75267214): Since ptsDevice must be shared between
-		// different mounts, we must not assign fixed numbers.
-		InodeID: ptsDevice.NextIno(),
-		Type:    fs.CharacterDevice,
-		// See fs/devpts/inode.c:devpts_fill_super.
-		BlockSize:       1024,
-		DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR,
-		DeviceFileMinor: t.n,
-	})
-}
-
-// Release implements fs.InodeOperations.Release.
-func (si *slaveInodeOperations) Release(ctx context.Context) {
-	si.t.DecRef(ctx)
-}
-
-// Truncate implements fs.InodeOperations.Truncate.
-func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
-	return nil
-}
-
-// GetFile implements fs.InodeOperations.GetFile.
-//
-// This may race with destruction of the terminal. If the terminal is gone, it
-// returns ENOENT.
-func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
-	return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil
-}
-
-// slaveFileOperations are the fs.FileOperations for the slave end of a terminal.
-//
-// +stateify savable
-type slaveFileOperations struct {
-	fsutil.FilePipeSeek             `state:"nosave"`
-	fsutil.FileNotDirReaddir        `state:"nosave"`
-	fsutil.FileNoFsync              `state:"nosave"`
-	fsutil.FileNoMMap               `state:"nosave"`
-	fsutil.FileNoSplice             `state:"nosave"`
-	fsutil.FileNoopFlush            `state:"nosave"`
-	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
-
-	// si is the inode operations.
-	si *slaveInodeOperations
-}
-
-var _ fs.FileOperations = (*slaveFileOperations)(nil)
-
-// Release implements fs.FileOperations.Release.
-func (sf *slaveFileOperations) Release(context.Context) {
-}
-
-// EventRegister implements waiter.Waitable.EventRegister.
-func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	sf.si.t.ld.slaveWaiter.EventRegister(e, mask)
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) {
-	sf.si.t.ld.slaveWaiter.EventUnregister(e)
-}
-
-// Readiness implements waiter.Waitable.Readiness.
-func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return sf.si.t.ld.slaveReadiness()
-}
-
-// Read implements fs.FileOperations.Read.
-func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
-	return sf.si.t.ld.inputQueueRead(ctx, dst)
-}
-
-// Write implements fs.FileOperations.Write.
-func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
-	return sf.si.t.ld.outputQueueWrite(ctx, src)
-}
-
-// Ioctl implements fs.FileOperations.Ioctl.
-func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
-	t := kernel.TaskFromContext(ctx)
-	if t == nil {
-		// ioctl(2) may only be called from a task goroutine.
-		return 0, syserror.ENOTTY
-	}
-
-	switch cmd := args[1].Uint(); cmd {
-	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
-		// Get the number of bytes in the input queue read buffer.
-		return 0, sf.si.t.ld.inputQueueReadSize(t, args)
-	case linux.TCGETS:
-		return sf.si.t.ld.getTermios(t, args)
-	case linux.TCSETS:
-		return sf.si.t.ld.setTermios(t, args)
-	case linux.TCSETSW:
-		// TODO(b/29356795): This should drain the output queue first.
-		return sf.si.t.ld.setTermios(t, args)
-	case linux.TIOCGPTN:
-		nP := primitive.Uint32(sf.si.t.n)
-		_, err := nP.CopyOut(t, args[2].Pointer())
-		return 0, err
-	case linux.TIOCGWINSZ:
-		return 0, sf.si.t.ld.windowSize(t, args)
-	case linux.TIOCSWINSZ:
-		return 0, sf.si.t.ld.setWindowSize(t, args)
-	case linux.TIOCSCTTY:
-		// Make the given terminal the controlling terminal of the
-		// calling process.
-		return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */)
-	case linux.TIOCNOTTY:
-		// Release this process's controlling terminal.
-		return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */)
-	case linux.TIOCGPGRP:
-		// Get the foreground process group.
-		return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
-	case linux.TIOCSPGRP:
-		// Set the foreground process group.
-		return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
-	default:
-		maybeEmitUnimplementedEvent(ctx, cmd)
-		return 0, syserror.ENOTTY
-	}
-}
-
-// LINT.ThenChange(../../fsimpl/devpts/slave.go)
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index 56b59632d..c9dbf1f3b 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -44,19 +44,19 @@ type Terminal struct {
 	// this terminal. This field is immutable.
 	masterKTTY *kernel.TTY
 
-	// slaveKTTY contains the controlling process of the slave end of this
+	// replicaKTTY contains the controlling process of the replica end of this
 	// terminal. This field is immutable.
-	slaveKTTY *kernel.TTY
+	replicaKTTY *kernel.TTY
 }
 
 func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
-	termios := linux.DefaultSlaveTermios
+	termios := linux.DefaultReplicaTermios
 	t := Terminal{
-		d:          d,
-		n:          n,
-		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{Index: n},
-		slaveKTTY:  &kernel.TTY{Index: n},
+		d:           d,
+		n:           n,
+		ld:          newLineDiscipline(termios),
+		masterKTTY:  &kernel.TTY{Index: n},
+		replicaKTTY: &kernel.TTY{Index: n},
 	}
 	t.EnableLeakCheck("tty.Terminal")
 	return &t
@@ -123,7 +123,7 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	if isMaster {
 		return tm.masterKTTY
 	}
-	return tm.slaveKTTY
+	return tm.replicaKTTY
 }
 
 // LINT.ThenChange(../../fsimpl/devpts/terminal.go)
diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go
index 2cbc05678..49edee83d 100644
--- a/pkg/sentry/fs/tty/tty_test.go
+++ b/pkg/sentry/fs/tty/tty_test.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func TestSimpleMasterToSlave(t *testing.T) {
-	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultReplicaTermios)
 	ctx := contexttest.Context(t)
 	inBytes := []byte("hello, tty\n")
 	src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 3e8c5e3fd..ac48ab34b 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -21,8 +21,8 @@ go_library(
         "line_discipline.go",
         "master.go",
         "queue.go",
+        "replica.go",
         "root_inode_refs.go",
-        "slave.go",
         "terminal.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 57580f4d4..dcf1ee25b 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -79,7 +79,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
 
 	// Construct the root directory. This is always inode id 1.
 	root := &rootInode{
-		slaves: make(map[uint32]*slaveInode),
+		replicas: make(map[uint32]*replicaInode),
 	}
 	root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
 	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
@@ -133,8 +133,8 @@ type rootInode struct {
 	// mu protects the fields below.
 	mu sync.Mutex
 
-	// slaves maps pty ids to slave inodes.
-	slaves map[uint32]*slaveInode
+	// replicas maps pty ids to replica inodes.
+	replicas map[uint32]*replicaInode
 
 	// nextIdx is the next pty index to use. Must be accessed atomically.
 	//
@@ -154,22 +154,22 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error)
 	idx := i.nextIdx
 	i.nextIdx++
 
-	// Sanity check that slave with idx does not exist.
-	if _, ok := i.slaves[idx]; ok {
+	// Sanity check that replica with idx does not exist.
+	if _, ok := i.replicas[idx]; ok {
 		panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
 	}
 
-	// Create the new terminal and slave.
+	// Create the new terminal and replica.
 	t := newTerminal(idx)
-	slave := &slaveInode{
+	replica := &replicaInode{
 		root: i,
 		t:    t,
 	}
 	// Linux always uses pty index + 3 as the inode id. See
 	// fs/devpts/inode.c:devpts_pty_new().
-	slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
-	slave.dentry.Init(slave)
-	i.slaves[idx] = slave
+	replica.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+	replica.dentry.Init(replica)
+	i.replicas[idx] = replica
 
 	return t, nil
 }
@@ -179,11 +179,11 @@ func (i *rootInode) masterClose(t *Terminal) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
 
-	// Sanity check that slave with idx exists.
-	if _, ok := i.slaves[t.n]; !ok {
+	// Sanity check that replica with idx exists.
+	if _, ok := i.replicas[t.n]; !ok {
 		panic(fmt.Sprintf("pty with index %d does not exist", t.n))
 	}
-	delete(i.slaves, t.n)
+	delete(i.replicas, t.n)
 }
 
 // Open implements kernfs.Inode.Open.
@@ -205,7 +205,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error
 	}
 	i.mu.Lock()
 	defer i.mu.Unlock()
-	if si, ok := i.slaves[uint32(idx)]; ok {
+	if si, ok := i.replicas[uint32(idx)]; ok {
 		si.dentry.IncRef()
 		return si.dentry.VFSDentry(), nil
 
@@ -217,8 +217,8 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error
 func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
-	ids := make([]int, 0, len(i.slaves))
-	for id := range i.slaves {
+	ids := make([]int, 0, len(i.replicas))
+	for id := range i.replicas {
 		ids = append(ids, int(id))
 	}
 	sort.Ints(ids)
@@ -226,7 +226,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
 		dirent := vfs.Dirent{
 			Name:    strconv.FormatUint(uint64(id), 10),
 			Type:    linux.DT_CHR,
-			Ino:     i.slaves[uint32(id)].InodeAttrs.Ino(),
+			Ino:     i.replicas[uint32(id)].InodeAttrs.Ino(),
 			NextOff: offset + 1,
 		}
 		if err := cb.Handle(dirent); err != nil {
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
index b7c149047..448390cfe 100644
--- a/pkg/sentry/fsimpl/devpts/devpts_test.go
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func TestSimpleMasterToSlave(t *testing.T) {
-	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultReplicaTermios)
 	ctx := contexttest.Context(t)
 	inBytes := []byte("hello, tty\n")
 	src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index b954c1ba1..e6b0e81cf 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -42,7 +42,7 @@ const (
 )
 
 // lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
 // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
 // pages are good resources for how to affect the line discipline:
 //
@@ -53,8 +53,8 @@ const (
 //
 // lineDiscipline has a simple structure but supports a multitude of options
 // (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
 // discipline reads the bytes, modifies them or takes special action if
 // required, and enqueues them to be read by the other end of the pty:
 //
@@ -63,7 +63,7 @@ const (
 //    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
 //    |                                                                   |
 //    |                                                                   v
-// masterFD                                                            slaveFD
+// masterFD                                                           replicaFD
 //    ^                                                                   |
 //    |                                                                   |
 //    |   output to terminal   +--------------+    output from process    |
@@ -102,8 +102,8 @@ type lineDiscipline struct {
 	// masterWaiter is used to wait on the master end of the TTY.
 	masterWaiter waiter.Queue `state:"zerovalue"`
 
-	// slaveWaiter is used to wait on the slave end of the TTY.
-	slaveWaiter waiter.Queue `state:"zerovalue"`
+	// replicaWaiter is used to wait on the replica end of the TTY.
+	replicaWaiter waiter.Queue `state:"zerovalue"`
 }
 
 func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -141,7 +141,7 @@ func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArgument
 		l.inQueue.pushWaitBufLocked(l)
 		l.inQueue.readable = true
 		l.inQueue.mu.Unlock()
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 	}
 
 	return 0, err
@@ -167,7 +167,7 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
 	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
 }
 
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
@@ -187,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
 	if n > 0 {
 		l.masterWaiter.Notify(waiter.EventOut)
 		if pushed {
-			l.slaveWaiter.Notify(waiter.EventIn)
+			l.replicaWaiter.Notify(waiter.EventIn)
 		}
 		return n, nil
 	}
@@ -202,7 +202,7 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 		return n, nil
 	}
 	return 0, syserror.ErrWouldBlock
@@ -220,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventOut)
+		l.replicaWaiter.Notify(waiter.EventOut)
 		if pushed {
 			l.masterWaiter.Notify(waiter.EventIn)
 		}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 3422db6a4..d07e1ded8 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -145,11 +145,11 @@ func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args
 		return 0, mfd.t.ld.outputQueueReadSize(t, io, args)
 	case linux.TCGETS:
 		// N.B. TCGETS on the master actually returns the configuration
-		// of the slave end.
+		// of the replica end.
 		return mfd.t.ld.getTermios(t, args)
 	case linux.TCSETS:
 		// N.B. TCSETS on the master actually affects the configuration
-		// of the slave end.
+		// of the replica end.
 		return mfd.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index 08eca2589..ca36b66e9 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -32,7 +32,7 @@ import (
 const waitBufMaxBytes = 131072
 
 // queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
 // full, at which point they are written to the wait buffer. Bytes are
 // processed (i.e. undergo termios transformations) as they are added to the
 // read buffer. The read buffer is readable when its length is nonzero and
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
new file mode 100644
index 000000000..1f99f4b4d
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -0,0 +1,205 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/primitive"
+)
+
+// replicaInode is the inode for the replica end of the Terminal.
+type replicaInode struct {
+	implStatFS
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// root is the devpts root inode.
+	root *rootInode
+
+	// t is the connected Terminal.
+	t *Terminal
+}
+
+var _ kernfs.Inode = (*replicaInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (si *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	si.IncRef()
+	fd := &replicaFileDescription{
+		inode: si,
+	}
+	fd.LockFD.Init(&si.locks)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		si.DecRef(ctx)
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (si *replicaInode) Valid(context.Context) bool {
+	// Return valid if the replica still exists.
+	si.root.mu.Lock()
+	defer si.root.mu.Unlock()
+	_, ok := si.root.replicas[si.t.n]
+	return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (si *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR
+	statx.RdevMinor = si.t.n
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (si *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+type replicaFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	inode *replicaInode
+}
+
+var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (sfd *replicaFileDescription) Release(ctx context.Context) {
+	sfd.inode.DecRef(ctx)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (sfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	sfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (sfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
+	sfd.inode.t.ld.replicaWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (sfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return sfd.inode.t.ld.replicaReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (sfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return sfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (sfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (sfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the input queue read buffer.
+		return 0, sfd.inode.t.ld.inputQueueReadSize(t, io, args)
+	case linux.TCGETS:
+		return sfd.inode.t.ld.getTermios(t, args)
+	case linux.TCSETS:
+		return sfd.inode.t.ld.setTermios(t, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return sfd.inode.t.ld.setTermios(t, args)
+	case linux.TIOCGPTN:
+		nP := primitive.Uint32(sfd.inode.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
+		return 0, err
+	case linux.TIOCGWINSZ:
+		return 0, sfd.inode.t.ld.windowSize(t, args)
+	case linux.TIOCSWINSZ:
+		return 0, sfd.inode.t.ld.setWindowSize(t, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, sfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, sfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return sfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return sfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (sfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (sfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (sfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (sfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
deleted file mode 100644
index 5f4b474b3..000000000
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package devpts
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
-)
-
-// slaveInode is the inode for the slave end of the Terminal.
-type slaveInode struct {
-	implStatFS
-	kernfs.InodeAttrs
-	kernfs.InodeNoopRefCount
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
-
-	locks vfs.FileLocks
-
-	// Keep a reference to this inode's dentry.
-	dentry kernfs.Dentry
-
-	// root is the devpts root inode.
-	root *rootInode
-
-	// t is the connected Terminal.
-	t *Terminal
-}
-
-var _ kernfs.Inode = (*slaveInode)(nil)
-
-// Open implements kernfs.Inode.Open.
-func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	si.IncRef()
-	fd := &slaveFileDescription{
-		inode: si,
-	}
-	fd.LockFD.Init(&si.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-		si.DecRef(ctx)
-		return nil, err
-	}
-	return &fd.vfsfd, nil
-
-}
-
-// Valid implements kernfs.Inode.Valid.
-func (si *slaveInode) Valid(context.Context) bool {
-	// Return valid if the slave still exists.
-	si.root.mu.Lock()
-	defer si.root.mu.Unlock()
-	_, ok := si.root.slaves[si.t.n]
-	return ok
-}
-
-// Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	statx.Blksize = 1024
-	statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
-	statx.RdevMinor = si.t.n
-	return statx, nil
-}
-
-// SetStat implements kernfs.Inode.SetStat
-func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
-	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
-		return syserror.EINVAL
-	}
-	return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
-}
-
-type slaveFileDescription struct {
-	vfsfd vfs.FileDescription
-	vfs.FileDescriptionDefaultImpl
-	vfs.LockFD
-
-	inode *slaveInode
-}
-
-var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
-
-// Release implements fs.FileOperations.Release.
-func (sfd *slaveFileDescription) Release(ctx context.Context) {
-	sfd.inode.DecRef(ctx)
-}
-
-// EventRegister implements waiter.Waitable.EventRegister.
-func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
-	sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
-}
-
-// Readiness implements waiter.Waitable.Readiness.
-func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return sfd.inode.t.ld.slaveReadiness()
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
-	return sfd.inode.t.ld.inputQueueRead(ctx, dst)
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
-	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
-}
-
-// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
-func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
-	t := kernel.TaskFromContext(ctx)
-	if t == nil {
-		// ioctl(2) may only be called from a task goroutine.
-		return 0, syserror.ENOTTY
-	}
-
-	switch cmd := args[1].Uint(); cmd {
-	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
-		// Get the number of bytes in the input queue read buffer.
-		return 0, sfd.inode.t.ld.inputQueueReadSize(t, io, args)
-	case linux.TCGETS:
-		return sfd.inode.t.ld.getTermios(t, args)
-	case linux.TCSETS:
-		return sfd.inode.t.ld.setTermios(t, args)
-	case linux.TCSETSW:
-		// TODO(b/29356795): This should drain the output queue first.
-		return sfd.inode.t.ld.setTermios(t, args)
-	case linux.TIOCGPTN:
-		nP := primitive.Uint32(sfd.inode.t.n)
-		_, err := nP.CopyOut(t, args[2].Pointer())
-		return 0, err
-	case linux.TIOCGWINSZ:
-		return 0, sfd.inode.t.ld.windowSize(t, args)
-	case linux.TIOCSWINSZ:
-		return 0, sfd.inode.t.ld.setWindowSize(t, args)
-	case linux.TIOCSCTTY:
-		// Make the given terminal the controlling terminal of the
-		// calling process.
-		return 0, sfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
-	case linux.TIOCNOTTY:
-		// Release this process's controlling terminal.
-		return 0, sfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
-	case linux.TIOCGPGRP:
-		// Get the foreground process group.
-		return sfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
-	case linux.TIOCSPGRP:
-		// Set the foreground process group.
-		return sfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
-	default:
-		maybeEmitUnimplementedEvent(ctx, cmd)
-		return 0, syserror.ENOTTY
-	}
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	creds := auth.CredentialsFromContext(ctx)
-	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.SetStat(ctx, fs, creds, opts)
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.Stat(ctx, fs, opts)
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
-	return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
-	return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
-}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index e88eb6360..731955d62 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -36,18 +36,18 @@ type Terminal struct {
 	// this terminal. This field is immutable.
 	masterKTTY *kernel.TTY
 
-	// slaveKTTY contains the controlling process of the slave end of this
+	// replicaKTTY contains the controlling process of the replica end of this
 	// terminal. This field is immutable.
-	slaveKTTY *kernel.TTY
+	replicaKTTY *kernel.TTY
 }
 
 func newTerminal(n uint32) *Terminal {
-	termios := linux.DefaultSlaveTermios
+	termios := linux.DefaultReplicaTermios
 	t := Terminal{
-		n:          n,
-		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{Index: n},
-		slaveKTTY:  &kernel.TTY{Index: n},
+		n:           n,
+		ld:          newLineDiscipline(termios),
+		masterKTTY:  &kernel.TTY{Index: n},
+		replicaKTTY: &kernel.TTY{Index: n},
 	}
 	return &t
 }
@@ -113,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	if isMaster {
 		return tm.masterKTTY
 	}
-	return tm.slaveKTTY
+	return tm.replicaKTTY
 }
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 7561f821c..1bd0e4ee8 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -496,7 +496,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 		if i.isTTY {
 			fd := &TTYFileDescription{
 				fileDescription: fileDescription{inode: i},
-				termios:         linux.DefaultSlaveTermios,
+				termios:         linux.DefaultReplicaTermios,
 			}
 			fd.LockFD.Init(&i.locks)
 			vfsfd := &fd.vfsfd
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 163265afe..ea0461a3d 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -453,17 +453,17 @@ func (m *mountHint) isSupported() bool {
 func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	// Remove options that don't affect to mount's behavior.
 	masterOpts := filterUnsupportedOptions(m.mount)
-	slaveOpts := filterUnsupportedOptions(mount)
+	replicaOpts := filterUnsupportedOptions(mount)
 
-	if len(masterOpts) != len(slaveOpts) {
-		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	if len(masterOpts) != len(replicaOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 	}
 
 	sort.Strings(masterOpts)
-	sort.Strings(slaveOpts)
+	sort.Strings(replicaOpts)
 	for i, opt := range masterOpts {
-		if opt != slaveOpts[i] {
-			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		if opt != replicaOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 		}
 	}
 	return nil
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 600876a27..775ed4b43 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -220,7 +220,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 	cmd.Stderr = os.Stderr
 
 	// If the console control socket file is provided, then create a new
-	// pty master/slave pair and set the TTY on the sandbox process.
+	// pty master/replica pair and set the TTY on the sandbox process.
 	if ex.consoleSocket != "" {
 		// Create a new TTY pair and send the master on the provided socket.
 		tty, err := console.NewWithSocket(ex.consoleSocket)
@@ -229,7 +229,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 		}
 		defer tty.Close()
 
-		// Set stdio to the new TTY slave.
+		// Set stdio to the new TTY replica.
 		cmd.Stdin = tty
 		cmd.Stdout = tty
 		cmd.Stderr = tty
diff --git a/runsc/console/console.go b/runsc/console/console.go
index 64b23639a..dbb88e117 100644
--- a/runsc/console/console.go
+++ b/runsc/console/console.go
@@ -24,11 +24,11 @@ import (
 	"golang.org/x/sys/unix"
 )
 
-// NewWithSocket creates pty master/slave pair, sends the master FD over the given
-// socket, and returns the slave.
+// NewWithSocket creates pty master/replica pair, sends the master FD over the given
+// socket, and returns the replica.
 func NewWithSocket(socketPath string) (*os.File, error) {
-	// Create a new pty master and slave.
-	ptyMaster, ptySlave, err := pty.Open()
+	// Create a new pty master and replica.
+	ptyMaster, ptyReplica, err := pty.Open()
 	if err != nil {
 		return nil, fmt.Errorf("opening pty: %v", err)
 	}
@@ -37,18 +37,18 @@ func NewWithSocket(socketPath string) (*os.File, error) {
 	// Get a connection to the socket path.
 	conn, err := net.Dial("unix", socketPath)
 	if err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err)
 	}
 	defer conn.Close()
 	uc, ok := conn.(*net.UnixConn)
 	if !ok {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
 	}
 	socket, err := uc.File()
 	if err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err)
 	}
 	defer socket.Close()
@@ -56,8 +56,8 @@ func NewWithSocket(socketPath string) (*os.File, error) {
 	// Send the master FD over the connection.
 	msg := unix.UnixRights(int(ptyMaster.Fd()))
 	if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err)
 	}
-	return ptySlave, nil
+	return ptyReplica, nil
 }
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 995d4e267..4228399b8 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -185,14 +185,14 @@ func TestJobControlSignalExec(t *testing.T) {
 		t.Fatalf("error starting container: %v", err)
 	}
 
-	// Create a pty master/slave. The slave will be passed to the exec
+	// Create a pty master/replica. The replica will be passed to the exec
 	// process.
-	ptyMaster, ptySlave, err := pty.Open()
+	ptyMaster, ptyReplica, err := pty.Open()
 	if err != nil {
 		t.Fatalf("error opening pty: %v", err)
 	}
 	defer ptyMaster.Close()
-	defer ptySlave.Close()
+	defer ptyReplica.Close()
 
 	// Exec bash and attach a terminal. Note that occasionally /bin/sh
 	// may be a different shell or have a different configuration (such
@@ -203,9 +203,9 @@ func TestJobControlSignalExec(t *testing.T) {
 		// Don't let bash execute from profile or rc files, otherwise
 		// our PID counts get messed up.
 		Argv: []string{"/bin/bash", "--noprofile", "--norc"},
-		// Pass the pty slave as FD 0, 1, and 2.
+		// Pass the pty replica as FD 0, 1, and 2.
 		FilePayload: urpc.FilePayload{
-			Files: []*os.File{ptySlave, ptySlave, ptySlave},
+			Files: []*os.File{ptyReplica, ptyReplica, ptyReplica},
 		},
 		StdioIsPty: true,
 	}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 1beea123f..da1694280 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -1360,7 +1360,7 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 }
 
 // Test that unsupported pod mounts options are ignored when matching master and
-// slave mounts.
+// replica mounts.
 func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index a339937fb..a8f4f64a5 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -478,10 +478,10 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
 	cmd.Stderr = nil
 
 	// If the console control socket file is provided, then create a new
-	// pty master/slave pair and set the TTY on the sandbox process.
+	// pty master/replica pair and set the TTY on the sandbox process.
 	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
 		// console.NewWithSocket will send the master on the given
-		// socket, and return the slave.
+		// socket, and return the replica.
 		tty, err := console.NewWithSocket(args.ConsoleSocket)
 		if err != nil {
 			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index 2e4ab6ca8..0b174e2be 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -71,7 +71,7 @@ constexpr absl::Duration kTimeout = absl::Seconds(20);
 // The maximum line size in bytes returned per read from a pty file.
 constexpr int kMaxLineSize = 4096;
 
-constexpr char kMainPath[] = "/dev/ptmx";
+constexpr char kMasterPath[] = "/dev/ptmx";
 
 // glibc defines its own, different, version of struct termios. We care about
 // what the kernel does, not glibc.
@@ -388,22 +388,22 @@ PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
 TEST(PtyTrunc, Truncate) {
   // Opening PTYs with O_TRUNC shouldn't cause an error, but calls to
   // (f)truncate should.
-  FileDescriptor main =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(kMainPath, O_RDWR | O_TRUNC));
-  int n = ASSERT_NO_ERRNO_AND_VALUE(ReplicaID(main));
+  FileDescriptor master =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(kMasterPath, O_RDWR | O_TRUNC));
+  int n = ASSERT_NO_ERRNO_AND_VALUE(ReplicaID(master));
   std::string spath = absl::StrCat("/dev/pts/", n);
   FileDescriptor replica =
       ASSERT_NO_ERRNO_AND_VALUE(Open(spath, O_RDWR | O_NONBLOCK | O_TRUNC));
 
-  EXPECT_THAT(truncate(kMainPath, 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(truncate(kMasterPath, 0), SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(truncate(spath.c_str(), 0), SyscallFailsWithErrno(EINVAL));
-  EXPECT_THAT(ftruncate(main.get(), 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(ftruncate(master.get(), 0), SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(ftruncate(replica.get(), 0), SyscallFailsWithErrno(EINVAL));
 }
 
-TEST(BasicPtyTest, StatUnopenedMain) {
+TEST(BasicPtyTest, StatUnopenedMaster) {
   struct stat s;
-  ASSERT_THAT(stat(kMainPath, &s), SyscallSucceeds());
+  ASSERT_THAT(stat(kMasterPath, &s), SyscallSucceeds());
 
   EXPECT_EQ(s.st_rdev, makedev(TTYAUX_MAJOR, kPtmxMinor));
   EXPECT_EQ(s.st_size, 0);
@@ -454,41 +454,41 @@ void ExpectReadable(const FileDescriptor& fd, int expected, char* buf) {
   EXPECT_EQ(expected, n);
 }
 
-TEST(BasicPtyTest, OpenMainReplica) {
-  FileDescriptor main = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
-  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(main));
+TEST(BasicPtyTest, OpenMasterReplica) {
+  FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master));
 }
 
-// The replica entry in /dev/pts/ disappears when the main is closed, even if
+// The replica entry in /dev/pts/ disappears when the master is closed, even if
 // the replica is still open.
-TEST(BasicPtyTest, ReplicaEntryGoneAfterMainClose) {
-  FileDescriptor main = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
-  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(main));
+TEST(BasicPtyTest, ReplicaEntryGoneAfterMasterClose) {
+  FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master));
 
   // Get pty index.
   int index = -1;
-  ASSERT_THAT(ioctl(main.get(), TIOCGPTN, &index), SyscallSucceeds());
+  ASSERT_THAT(ioctl(master.get(), TIOCGPTN, &index), SyscallSucceeds());
 
   std::string path = absl::StrCat("/dev/pts/", index);
 
   struct stat st;
   EXPECT_THAT(stat(path.c_str(), &st), SyscallSucceeds());
 
-  main.reset();
+  master.reset();
 
   EXPECT_THAT(stat(path.c_str(), &st), SyscallFailsWithErrno(ENOENT));
 }
 
 TEST(BasicPtyTest, Getdents) {
-  FileDescriptor main1 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+  FileDescriptor master1 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
   int index1 = -1;
-  ASSERT_THAT(ioctl(main1.get(), TIOCGPTN, &index1), SyscallSucceeds());
-  FileDescriptor replica1 = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(main1));
+  ASSERT_THAT(ioctl(master1.get(), TIOCGPTN, &index1), SyscallSucceeds());
+  FileDescriptor replica1 = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master1));
 
-  FileDescriptor main2 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+  FileDescriptor master2 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
   int index2 = -1;
-  ASSERT_THAT(ioctl(main2.get(), TIOCGPTN, &index2), SyscallSucceeds());
-  FileDescriptor replica2 = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(main2));
+  ASSERT_THAT(ioctl(master2.get(), TIOCGPTN, &index2), SyscallSucceeds());
+  FileDescriptor replica2 = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master2));
 
   // The directory contains ptmx, index1, and index2. (Plus any additional PTYs
   // unrelated to this test.)
@@ -498,9 +498,9 @@ TEST(BasicPtyTest, Getdents) {
   EXPECT_THAT(contents, Contains(absl::StrCat(index1)));
   EXPECT_THAT(contents, Contains(absl::StrCat(index2)));
 
-  main2.reset();
+  master2.reset();
 
-  // The directory contains ptmx and index1, but not index2 since the main is
+  // The directory contains ptmx and index1, but not index2 since the master is
   // closed. (Plus any additional PTYs unrelated to this test.)
 
   contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/dev/pts/", true));
@@ -519,8 +519,8 @@ TEST(BasicPtyTest, Getdents) {
 class PtyTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    main_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-    replica_ = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(main_));
+    master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
+    replica_ = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master_));
   }
 
   void DisableCanonical() {
@@ -537,21 +537,22 @@ class PtyTest : public ::testing::Test {
     EXPECT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
   }
 
-  // Main and replica ends of the PTY. Non-blocking.
-  FileDescriptor main_;
+  // Master and replica ends of the PTY. Non-blocking.
+  FileDescriptor master_;
   FileDescriptor replica_;
 };
 
-// Main to replica sanity test.
-TEST_F(PtyTest, WriteMainToReplica) {
-  // N.B. by default, the replica reads nothing until the main writes a newline.
+// Master to replica sanity test.
+TEST_F(PtyTest, WriteMasterToReplica) {
+  // N.B. by default, the replica reads nothing until the master writes a
+  // newline.
   constexpr char kBuf[] = "hello\n";
 
-  EXPECT_THAT(WriteFd(main_.get(), kBuf, sizeof(kBuf) - 1),
+  EXPECT_THAT(WriteFd(master_.get(), kBuf, sizeof(kBuf) - 1),
               SyscallSucceedsWithValue(sizeof(kBuf) - 1));
 
-  // Linux moves data from the main to the replica via async work scheduled via
-  // tty_flip_buffer_push. Since it is asynchronous, the data may not be
+  // Linux moves data from the master to the replica via async work scheduled
+  // via tty_flip_buffer_push. Since it is asynchronous, the data may not be
   // available for reading immediately. Instead we must poll and assert that it
   // becomes available "soon".
 
@@ -561,63 +562,63 @@ TEST_F(PtyTest, WriteMainToReplica) {
   EXPECT_EQ(memcmp(buf, kBuf, sizeof(kBuf)), 0);
 }
 
-// Replica to main sanity test.
-TEST_F(PtyTest, WriteReplicaToMain) {
-  // N.B. by default, the main reads nothing until the replica writes a newline,
-  // and the main gets a carriage return.
+// Replica to master sanity test.
+TEST_F(PtyTest, WriteReplicaToMaster) {
+  // N.B. by default, the master reads nothing until the replica writes a
+  // newline, and the master gets a carriage return.
   constexpr char kInput[] = "hello\n";
   constexpr char kExpected[] = "hello\r\n";
 
   EXPECT_THAT(WriteFd(replica_.get(), kInput, sizeof(kInput) - 1),
               SyscallSucceedsWithValue(sizeof(kInput) - 1));
 
-  // Linux moves data from the main to the replica via async work scheduled via
-  // tty_flip_buffer_push. Since it is asynchronous, the data may not be
+  // Linux moves data from the master to the replica via async work scheduled
+  // via tty_flip_buffer_push. Since it is asynchronous, the data may not be
   // available for reading immediately. Instead we must poll and assert that it
   // becomes available "soon".
 
   char buf[sizeof(kExpected)] = {};
-  ExpectReadable(main_, sizeof(buf) - 1, buf);
+  ExpectReadable(master_, sizeof(buf) - 1, buf);
 
   EXPECT_EQ(memcmp(buf, kExpected, sizeof(kExpected)), 0);
 }
 
 TEST_F(PtyTest, WriteInvalidUTF8) {
   char c = 0xff;
-  ASSERT_THAT(syscall(__NR_write, main_.get(), &c, sizeof(c)),
+  ASSERT_THAT(syscall(__NR_write, master_.get(), &c, sizeof(c)),
               SyscallSucceedsWithValue(sizeof(c)));
 }
 
-// Both the main and replica report the standard default termios settings.
+// Both the master and replica report the standard default termios settings.
 //
-// Note that TCGETS on the main actually redirects to the replica (see comment
-// on MainTermiosUnchangable).
+// Note that TCGETS on the master actually redirects to the replica (see comment
+// on MasterTermiosUnchangable).
 TEST_F(PtyTest, DefaultTermios) {
   struct kernel_termios t = {};
   EXPECT_THAT(ioctl(replica_.get(), TCGETS, &t), SyscallSucceeds());
   EXPECT_EQ(t, DefaultTermios());
 
-  EXPECT_THAT(ioctl(main_.get(), TCGETS, &t), SyscallSucceeds());
+  EXPECT_THAT(ioctl(master_.get(), TCGETS, &t), SyscallSucceeds());
   EXPECT_EQ(t, DefaultTermios());
 }
 
-// Changing termios from the main actually affects the replica.
+// Changing termios from the master actually affects the replica.
 //
-// TCSETS on the main actually redirects to the replica (see comment on
-// MainTermiosUnchangable).
+// TCSETS on the master actually redirects to the replica (see comment on
+// MasterTermiosUnchangable).
 TEST_F(PtyTest, TermiosAffectsReplica) {
-  struct kernel_termios main_termios = {};
-  EXPECT_THAT(ioctl(main_.get(), TCGETS, &main_termios), SyscallSucceeds());
-  main_termios.c_lflag ^= ICANON;
-  EXPECT_THAT(ioctl(main_.get(), TCSETS, &main_termios), SyscallSucceeds());
+  struct kernel_termios master_termios = {};
+  EXPECT_THAT(ioctl(master_.get(), TCGETS, &master_termios), SyscallSucceeds());
+  master_termios.c_lflag ^= ICANON;
+  EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds());
 
   struct kernel_termios replica_termios = {};
   EXPECT_THAT(ioctl(replica_.get(), TCGETS, &replica_termios),
               SyscallSucceeds());
-  EXPECT_EQ(main_termios, replica_termios);
+  EXPECT_EQ(master_termios, replica_termios);
 }
 
-// The main end of the pty has termios:
+// The master end of the pty has termios:
 //
 // struct kernel_termios t = {
 //   .c_iflag = 0;
@@ -629,25 +630,25 @@ TEST_F(PtyTest, TermiosAffectsReplica) {
 //
 // (From drivers/tty/pty.c:unix98_pty_init)
 //
-// All termios control ioctls on the main actually redirect to the replica
+// All termios control ioctls on the master actually redirect to the replica
 // (drivers/tty/tty_ioctl.c:tty_mode_ioctl), making it impossible to change the
-// main termios.
+// master termios.
 //
 // Verify this by setting ICRNL (which rewrites input \r to \n) and verify that
-// it has no effect on the main.
-TEST_F(PtyTest, MainTermiosUnchangable) {
-  struct kernel_termios main_termios = {};
-  EXPECT_THAT(ioctl(main_.get(), TCGETS, &main_termios), SyscallSucceeds());
-  main_termios.c_lflag |= ICRNL;
-  EXPECT_THAT(ioctl(main_.get(), TCSETS, &main_termios), SyscallSucceeds());
+// it has no effect on the master.
+TEST_F(PtyTest, MasterTermiosUnchangable) {
+  struct kernel_termios master_termios = {};
+  EXPECT_THAT(ioctl(master_.get(), TCGETS, &master_termios), SyscallSucceeds());
+  master_termios.c_lflag |= ICRNL;
+  EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds());
 
   char c = '\r';
   ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
-  ExpectReadable(main_, 1, &c);
+  ExpectReadable(master_, 1, &c);
   EXPECT_EQ(c, '\r');  // ICRNL had no effect!
 
-  ExpectFinished(main_);
+  ExpectFinished(master_);
 }
 
 // ICRNL rewrites input \r to \n.
@@ -658,7 +659,7 @@ TEST_F(PtyTest, TermiosICRNL) {
   ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   char c = '\r';
-  ASSERT_THAT(WriteFd(main_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   ExpectReadable(replica_, 1, &c);
   EXPECT_EQ(c, '\n');
@@ -678,7 +679,7 @@ TEST_F(PtyTest, TermiosONLCR) {
 
   // Extra byte for NUL for EXPECT_STREQ.
   char buf[3] = {};
-  ExpectReadable(main_, 2, buf);
+  ExpectReadable(master_, 2, buf);
   EXPECT_STREQ(buf, "\r\n");
 
   ExpectFinished(replica_);
@@ -691,7 +692,7 @@ TEST_F(PtyTest, TermiosIGNCR) {
   ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   char c = '\r';
-  ASSERT_THAT(WriteFd(main_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   // Nothing to read.
   ASSERT_THAT(PollAndReadFd(replica_.get(), &c, 1, kTimeout),
@@ -725,18 +726,18 @@ TEST_F(PtyTest, TermiosPollReplica) {
   absl::SleepFor(absl::Seconds(1));
 
   char s[] = "foo\n";
-  ASSERT_THAT(WriteFd(main_.get(), s, strlen(s) + 1), SyscallSucceeds());
+  ASSERT_THAT(WriteFd(master_.get(), s, strlen(s) + 1), SyscallSucceeds());
 }
 
-// Test that we can successfully poll for readable data from the main.
-TEST_F(PtyTest, TermiosPollMain) {
+// Test that we can successfully poll for readable data from the master.
+TEST_F(PtyTest, TermiosPollMaster) {
   struct kernel_termios t = DefaultTermios();
   t.c_iflag |= IGNCR;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(main_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(master_.get(), TCSETS, &t), SyscallSucceeds());
 
   absl::Notification notify;
-  int mfd = main_.get();
+  int mfd = master_.get();
   ScopedThread th([mfd, &notify]() {
     notify.Notify();
 
@@ -765,7 +766,7 @@ TEST_F(PtyTest, TermiosINLCR) {
   ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   char c = '\n';
-  ASSERT_THAT(WriteFd(main_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   ExpectReadable(replica_, 1, &c);
   EXPECT_EQ(c, '\r');
@@ -784,7 +785,7 @@ TEST_F(PtyTest, TermiosONOCR) {
   ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   // Nothing to read.
-  ASSERT_THAT(PollAndReadFd(main_.get(), &c, 1, kTimeout),
+  ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
 
   // This time the column is greater than 0, so we should be able to read the CR
@@ -795,17 +796,17 @@ TEST_F(PtyTest, TermiosONOCR) {
               SyscallSucceedsWithValue(kInputSize));
 
   char buf[kInputSize] = {};
-  ExpectReadable(main_, kInputSize, buf);
+  ExpectReadable(master_, kInputSize, buf);
 
   EXPECT_EQ(memcmp(buf, kInput, kInputSize), 0);
 
-  ExpectFinished(main_);
+  ExpectFinished(master_);
 
   // Terminal should be at column 0 again, so no CR can be read.
   ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   // Nothing to read.
-  ASSERT_THAT(PollAndReadFd(main_.get(), &c, 1, kTimeout),
+  ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
 }
 
@@ -819,10 +820,10 @@ TEST_F(PtyTest, TermiosOCRNL) {
   char c = '\r';
   ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
-  ExpectReadable(main_, 1, &c);
+  ExpectReadable(master_, 1, &c);
   EXPECT_EQ(c, '\n');
 
-  ExpectFinished(main_);
+  ExpectFinished(master_);
 }
 
 // Tests that VEOL is disabled when we start, and that we can set it to enable
@@ -830,7 +831,7 @@ TEST_F(PtyTest, TermiosOCRNL) {
 TEST_F(PtyTest, VEOLTermination) {
   // Write a few bytes ending with '\0', and confirm that we can't read.
   constexpr char kInput[] = "hello";
-  ASSERT_THAT(WriteFd(main_.get(), kInput, sizeof(kInput)),
+  ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)),
               SyscallSucceedsWithValue(sizeof(kInput)));
   char buf[sizeof(kInput)] = {};
   ASSERT_THAT(PollAndReadFd(replica_.get(), buf, sizeof(kInput), kTimeout),
@@ -841,7 +842,7 @@ TEST_F(PtyTest, VEOLTermination) {
   struct kernel_termios t = DefaultTermios();
   t.c_cc[VEOL] = delim;
   ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
-  ASSERT_THAT(WriteFd(main_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
   // Now we can read, as sending EOL caused the line to become available.
   ExpectReadable(replica_, sizeof(kInput), buf);
@@ -861,7 +862,7 @@ TEST_F(PtyTest, CanonBigWrite) {
   char input[kWriteLen];
   memset(input, 'M', kWriteLen - 1);
   input[kWriteLen - 1] = '\n';
-  ASSERT_THAT(WriteFd(main_.get(), input, kWriteLen),
+  ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
               SyscallSucceedsWithValue(kWriteLen));
 
   // We can read the line.
@@ -877,7 +878,7 @@ TEST_F(PtyTest, SwitchCanonToNoncanon) {
   // Write a few bytes without a terminating character, switch to noncanonical
   // mode, and read them.
   constexpr char kInput[] = "hello";
-  ASSERT_THAT(WriteFd(main_.get(), kInput, sizeof(kInput)),
+  ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)),
               SyscallSucceedsWithValue(sizeof(kInput)));
 
   // Nothing available yet.
@@ -896,7 +897,7 @@ TEST_F(PtyTest, SwitchCanonToNoncanon) {
 TEST_F(PtyTest, SwitchCanonToNonCanonNewline) {
   // Write a few bytes with a terminating character.
   constexpr char kInput[] = "hello\n";
-  ASSERT_THAT(WriteFd(main_.get(), kInput, sizeof(kInput)),
+  ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)),
               SyscallSucceedsWithValue(sizeof(kInput)));
 
   DisableCanonical();
@@ -916,12 +917,12 @@ TEST_F(PtyTest, SwitchNoncanonToCanonNewlineBig) {
   constexpr int kWriteLen = 4100;
   char input[kWriteLen];
   memset(input, 'M', kWriteLen);
-  ASSERT_THAT(WriteFd(main_.get(), input, kWriteLen),
+  ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
               SyscallSucceedsWithValue(kWriteLen));
   // Wait for the input queue to fill.
   ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), kMaxLineSize - 1));
   constexpr char delim = '\n';
-  ASSERT_THAT(WriteFd(main_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
   EnableCanonical();
 
@@ -941,7 +942,7 @@ TEST_F(PtyTest, SwitchNoncanonToCanonNoNewline) {
   // Write a few bytes without a terminating character.
   // mode, and read them.
   constexpr char kInput[] = "hello";
-  ASSERT_THAT(WriteFd(main_.get(), kInput, sizeof(kInput) - 1),
+  ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput) - 1),
               SyscallSucceedsWithValue(sizeof(kInput) - 1));
 
   ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), sizeof(kInput) - 1));
@@ -963,7 +964,7 @@ TEST_F(PtyTest, SwitchNoncanonToCanonNoNewlineBig) {
   constexpr int kWriteLen = 4100;
   char input[kWriteLen];
   memset(input, 'M', kWriteLen);
-  ASSERT_THAT(WriteFd(main_.get(), input, kWriteLen),
+  ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
               SyscallSucceedsWithValue(kWriteLen));
 
   ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), kMaxLineSize - 1));
@@ -987,12 +988,12 @@ TEST_F(PtyTest, NoncanonBigWrite) {
   for (int i = 0; i < kInputSize; i++) {
     // This makes too many syscalls for save/restore.
     const DisableSave ds;
-    ASSERT_THAT(WriteFd(main_.get(), &kInput, sizeof(kInput)),
+    ASSERT_THAT(WriteFd(master_.get(), &kInput, sizeof(kInput)),
                 SyscallSucceedsWithValue(sizeof(kInput)));
   }
 
   // We should be able to read out everything. Sleep a bit so that Linux has a
-  // chance to move data from the main to the replica.
+  // chance to move data from the master to the replica.
   ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), kMaxLineSize - 1));
   for (int i = 0; i < kInputSize; i++) {
     // This makes too many syscalls for save/restore.
@@ -1010,7 +1011,7 @@ TEST_F(PtyTest, NoncanonBigWrite) {
 // Test newline.
 TEST_F(PtyTest, TermiosICANONNewline) {
   char input[3] = {'a', 'b', 'c'};
-  ASSERT_THAT(WriteFd(main_.get(), input, sizeof(input)),
+  ASSERT_THAT(WriteFd(master_.get(), input, sizeof(input)),
               SyscallSucceedsWithValue(sizeof(input)));
 
   // Extra bytes for newline (written later) and NUL for EXPECT_STREQ.
@@ -1021,7 +1022,7 @@ TEST_F(PtyTest, TermiosICANONNewline) {
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
 
   char delim = '\n';
-  ASSERT_THAT(WriteFd(main_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
   // Now it is available.
   ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), sizeof(input) + 1));
@@ -1036,7 +1037,7 @@ TEST_F(PtyTest, TermiosICANONNewline) {
 // Test EOF (^D).
 TEST_F(PtyTest, TermiosICANONEOF) {
   char input[3] = {'a', 'b', 'c'};
-  ASSERT_THAT(WriteFd(main_.get(), input, sizeof(input)),
+  ASSERT_THAT(WriteFd(master_.get(), input, sizeof(input)),
               SyscallSucceedsWithValue(sizeof(input)));
 
   // Extra byte for NUL for EXPECT_STREQ.
@@ -1046,7 +1047,7 @@ TEST_F(PtyTest, TermiosICANONEOF) {
   ASSERT_THAT(PollAndReadFd(replica_.get(), buf, sizeof(input), kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
   char delim = ControlCharacter('D');
-  ASSERT_THAT(WriteFd(main_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
   // Now it is available. Note that ^D is not included.
   ExpectReadable(replica_, sizeof(input), buf);
@@ -1069,10 +1070,10 @@ TEST_F(PtyTest, CanonDiscard) {
     // This makes too many syscalls for save/restore.
     const DisableSave ds;
     for (int i = 0; i < kInputSize; i++) {
-      ASSERT_THAT(WriteFd(main_.get(), &kInput, sizeof(kInput)),
+      ASSERT_THAT(WriteFd(master_.get(), &kInput, sizeof(kInput)),
                   SyscallSucceedsWithValue(sizeof(kInput)));
     }
-    ASSERT_THAT(WriteFd(main_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
   }
 
   // There should be multiple truncated lines available to read.
@@ -1091,9 +1092,9 @@ TEST_F(PtyTest, CanonMultiline) {
   constexpr char kInput2[] = "BLUE\n";
 
   // Write both lines.
-  ASSERT_THAT(WriteFd(main_.get(), kInput1, sizeof(kInput1) - 1),
+  ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1),
               SyscallSucceedsWithValue(sizeof(kInput1) - 1));
-  ASSERT_THAT(WriteFd(main_.get(), kInput2, sizeof(kInput2) - 1),
+  ASSERT_THAT(WriteFd(master_.get(), kInput2, sizeof(kInput2) - 1),
               SyscallSucceedsWithValue(sizeof(kInput2) - 1));
 
   // Get the first line.
@@ -1117,9 +1118,9 @@ TEST_F(PtyTest, SwitchNoncanonToCanonMultiline) {
   constexpr char kExpected[] = "GO\nBLUE\n";
 
   // Write both lines.
-  ASSERT_THAT(WriteFd(main_.get(), kInput1, sizeof(kInput1) - 1),
+  ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1),
               SyscallSucceedsWithValue(sizeof(kInput1) - 1));
-  ASSERT_THAT(WriteFd(main_.get(), kInput2, sizeof(kInput2) - 1),
+  ASSERT_THAT(WriteFd(master_.get(), kInput2, sizeof(kInput2) - 1),
               SyscallSucceedsWithValue(sizeof(kInput2) - 1));
 
   ASSERT_NO_ERRNO(
@@ -1140,7 +1141,7 @@ TEST_F(PtyTest, SwitchTwiceMultiline) {
 
   // Write each line.
   for (const std::string& input : kInputs) {
-    ASSERT_THAT(WriteFd(main_.get(), input.c_str(), input.size()),
+    ASSERT_THAT(WriteFd(master_.get(), input.c_str(), input.size()),
                 SyscallSucceedsWithValue(input.size()));
   }
 
@@ -1162,7 +1163,7 @@ TEST_F(PtyTest, SwitchTwiceMultiline) {
 TEST_F(PtyTest, QueueSize) {
   // Write the line.
   constexpr char kInput1[] = "GO\n";
-  ASSERT_THAT(WriteFd(main_.get(), kInput1, sizeof(kInput1) - 1),
+  ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1),
               SyscallSucceedsWithValue(sizeof(kInput1) - 1));
   ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), sizeof(kInput1) - 1));
 
@@ -1170,7 +1171,7 @@ TEST_F(PtyTest, QueueSize) {
   // readable size.
   char input[kMaxLineSize];
   memset(input, 'M', kMaxLineSize);
-  ASSERT_THAT(WriteFd(main_.get(), input, kMaxLineSize),
+  ASSERT_THAT(WriteFd(master_.get(), input, kMaxLineSize),
               SyscallSucceedsWithValue(kMaxLineSize));
   int inputBufSize = ASSERT_NO_ERRNO_AND_VALUE(
       WaitUntilReceived(replica_.get(), sizeof(kInput1) - 1));
@@ -1192,10 +1193,11 @@ TEST_F(PtyTest, PartialBadBuffer) {
   // Leave only one free byte in the buffer.
   char* bad_buffer = buf + kPageSize - 1;
 
-  // Write to the main.
+  // Write to the master.
   constexpr char kBuf[] = "hello\n";
   constexpr size_t size = sizeof(kBuf) - 1;
-  EXPECT_THAT(WriteFd(main_.get(), kBuf, size), SyscallSucceedsWithValue(size));
+  EXPECT_THAT(WriteFd(master_.get(), kBuf, size),
+              SyscallSucceedsWithValue(size));
 
   // Read from the replica into bad_buffer.
   ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), size));
@@ -1207,14 +1209,14 @@ TEST_F(PtyTest, PartialBadBuffer) {
 
 TEST_F(PtyTest, SimpleEcho) {
   constexpr char kInput[] = "Mr. Eko";
-  EXPECT_THAT(WriteFd(main_.get(), kInput, strlen(kInput)),
+  EXPECT_THAT(WriteFd(master_.get(), kInput, strlen(kInput)),
               SyscallSucceedsWithValue(strlen(kInput)));
 
   char buf[100] = {};
-  ExpectReadable(main_, strlen(kInput), buf);
+  ExpectReadable(master_, strlen(kInput), buf);
 
   EXPECT_STREQ(buf, kInput);
-  ExpectFinished(main_);
+  ExpectFinished(master_);
 }
 
 TEST_F(PtyTest, GetWindowSize) {
@@ -1231,16 +1233,17 @@ TEST_F(PtyTest, SetReplicaWindowSize) {
   ASSERT_THAT(ioctl(replica_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
   struct winsize retrieved_ws = {};
-  ASSERT_THAT(ioctl(main_.get(), TIOCGWINSZ, &retrieved_ws), SyscallSucceeds());
+  ASSERT_THAT(ioctl(master_.get(), TIOCGWINSZ, &retrieved_ws),
+              SyscallSucceeds());
   EXPECT_EQ(retrieved_ws.ws_row, kRows);
   EXPECT_EQ(retrieved_ws.ws_col, kCols);
 }
 
-TEST_F(PtyTest, SetMainWindowSize) {
+TEST_F(PtyTest, SetMasterWindowSize) {
   constexpr uint16_t kRows = 343;
   constexpr uint16_t kCols = 2401;
   struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
-  ASSERT_THAT(ioctl(main_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
+  ASSERT_THAT(ioctl(master_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
   struct winsize retrieved_ws = {};
   ASSERT_THAT(ioctl(replica_.get(), TIOCGWINSZ, &retrieved_ws),
@@ -1252,8 +1255,8 @@ TEST_F(PtyTest, SetMainWindowSize) {
 class JobControlTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    main_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-    replica_ = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(main_));
+    master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
+    replica_ = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master_));
 
     // Make this a session leader, which also drops the controlling terminal.
     // In the gVisor test environment, this test will be run as the session
@@ -1277,15 +1280,15 @@ class JobControlTest : public ::testing::Test {
     return PosixError(wstatus, "process returned");
   }
 
-  // Main and replica ends of the PTY. Non-blocking.
-  FileDescriptor main_;
+  // Master and replica ends of the PTY. Non-blocking.
+  FileDescriptor master_;
   FileDescriptor replica_;
 };
 
-TEST_F(JobControlTest, SetTTYMain) {
+TEST_F(JobControlTest, SetTTYMaster) {
   auto res = RunInChild([=]() {
     TEST_PCHECK(setsid() >= 0);
-    TEST_PCHECK(!ioctl(main_.get(), TIOCSCTTY, 0));
+    TEST_PCHECK(!ioctl(master_.get(), TIOCSCTTY, 0));
   });
   ASSERT_NO_ERRNO(res);
 }
@@ -1360,7 +1363,7 @@ TEST_F(JobControlTest, ReleaseWrongTTY) {
   auto res = RunInChild([=]() {
     TEST_PCHECK(setsid() >= 0);
     TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
-    TEST_PCHECK(ioctl(main_.get(), TIOCNOTTY) < 0 && errno == ENOTTY);
+    TEST_PCHECK(ioctl(master_.get(), TIOCNOTTY) < 0 && errno == ENOTTY);
   });
   ASSERT_NO_ERRNO(res);
 }
diff --git a/test/syscalls/linux/pty_root.cc b/test/syscalls/linux/pty_root.cc
index a534cf0bb..4ac648729 100644
--- a/test/syscalls/linux/pty_root.cc
+++ b/test/syscalls/linux/pty_root.cc
@@ -48,9 +48,9 @@ TEST(JobControlRootTest, StealTTY) {
     ASSERT_THAT(setsid(), SyscallSucceeds());
   }
 
-  FileDescriptor main =
+  FileDescriptor master =
       ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(main));
+  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master));
 
   // Make replica the controlling terminal.
   ASSERT_THAT(ioctl(replica.get(), TIOCSCTTY, 0), SyscallSucceeds());
diff --git a/test/util/pty_util.cc b/test/util/pty_util.cc
index 5fa622922..2cf0bea74 100644
--- a/test/util/pty_util.cc
+++ b/test/util/pty_util.cc
@@ -23,25 +23,25 @@
 namespace gvisor {
 namespace testing {
 
-PosixErrorOr<FileDescriptor> OpenReplica(const FileDescriptor& main) {
-  PosixErrorOr<int> n = ReplicaID(main);
+PosixErrorOr<FileDescriptor> OpenReplica(const FileDescriptor& master) {
+  PosixErrorOr<int> n = ReplicaID(master);
   if (!n.ok()) {
     return PosixErrorOr<FileDescriptor>(n.error());
   }
   return Open(absl::StrCat("/dev/pts/", n.ValueOrDie()), O_RDWR | O_NONBLOCK);
 }
 
-PosixErrorOr<int> ReplicaID(const FileDescriptor& main) {
+PosixErrorOr<int> ReplicaID(const FileDescriptor& master) {
   // Get pty index.
   int n;
-  int ret = ioctl(main.get(), TIOCGPTN, &n);
+  int ret = ioctl(master.get(), TIOCGPTN, &n);
   if (ret < 0) {
     return PosixError(errno, "ioctl(TIOCGPTN) failed");
   }
 
   // Unlock pts.
   int unlock = 0;
-  ret = ioctl(main.get(), TIOCSPTLCK, &unlock);
+  ret = ioctl(master.get(), TIOCSPTLCK, &unlock);
   if (ret < 0) {
     return PosixError(errno, "ioctl(TIOSPTLCK) failed");
   }
diff --git a/test/util/pty_util.h b/test/util/pty_util.h
index dff6adab5..ed7658868 100644
--- a/test/util/pty_util.h
+++ b/test/util/pty_util.h
@@ -21,11 +21,11 @@
 namespace gvisor {
 namespace testing {
 
-// Opens the replica end of the passed main as R/W and nonblocking.
-PosixErrorOr<FileDescriptor> OpenReplica(const FileDescriptor& main);
+// Opens the replica end of the passed master as R/W and nonblocking.
+PosixErrorOr<FileDescriptor> OpenReplica(const FileDescriptor& master);
 
-// Get the number of the replica end of the main.
-PosixErrorOr<int> ReplicaID(const FileDescriptor& main);
+// Get the number of the replica end of the master.
+PosixErrorOr<int> ReplicaID(const FileDescriptor& master);
 
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 29306b3f646c57aaa475ffe224b04d4473e364c7 Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Tue, 1 Sep 2020 17:40:28 -0700
Subject: Fix handling of unacceptable ACKs during close.

On receiving an ACK with unacceptable ACK number, in a closing state,
TCP, needs to reply back with an ACK with correct seq and ack numbers and
remain in same state. This change is as per RFC793 page 37, but with a
difference that it does not apply to ESTABLISHED state, just as in Linux.
Also add more tests to check for OTW sequence number and unacceptable
ack numbers in these states.

Fixes #3785

PiperOrigin-RevId: 329616283
---
 pkg/tcpip/transport/tcp/rcv.go                     |  37 +++-
 test/packetimpact/dut/posix_server.cc              |   9 +
 test/packetimpact/proto/posix_server.proto         |  11 +
 test/packetimpact/testbench/connections.go         |   2 +-
 test/packetimpact/testbench/dut.go                 |  26 +++
 test/packetimpact/tests/BUILD                      |   4 +-
 test/packetimpact/tests/tcp_close_wait_ack_test.go | 109 ----------
 test/packetimpact/tests/tcp_unacc_seq_ack_test.go  | 234 +++++++++++++++++++++
 8 files changed, 309 insertions(+), 123 deletions(-)
 delete mode 100644 test/packetimpact/tests/tcp_close_wait_ack_test.go
 create mode 100644 test/packetimpact/tests/tcp_unacc_seq_ack_test.go

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 5e0bfe585..bc920a03b 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -268,14 +268,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 	// If we are in one of the shutdown states then we need to do
 	// additional checks before we try and process the segment.
 	switch state {
-	case StateCloseWait:
-		// If the ACK acks something not yet sent then we send an ACK.
-		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
-			r.ep.snd.sendAck()
-			return true, nil
-		}
-		fallthrough
-	case StateClosing, StateLastAck:
+	case StateCloseWait, StateClosing, StateLastAck:
 		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
 			// Just drop the segment as we have
 			// already received a FIN and this
@@ -284,9 +277,31 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 			return true, nil
 		}
 		fallthrough
-	case StateFinWait1:
-		fallthrough
-	case StateFinWait2:
+	case StateFinWait1, StateFinWait2:
+		// If the ACK acks something not yet sent then we send an ACK.
+		//
+		// RFC793, page 37: If the connection is in a synchronized state,
+		// (ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK,
+		// TIME-WAIT), any unacceptable segment (out of window sequence number
+		// or unacceptable acknowledgment number) must elicit only an empty
+		// acknowledgment segment containing the current send-sequence number
+		// and an acknowledgment indicating the next sequence number expected
+		// to be received, and the connection remains in the same state.
+		//
+		// Just as on Linux, we do not apply this behavior when state is
+		// ESTABLISHED.
+		// Linux receive processing for all states except ESTABLISHED and
+		// TIME_WAIT is here where if the ACK check fails, we attempt to
+		// reply back with an ACK with correct seq/ack numbers.
+		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L6186
+		// The ESTABLISHED state processing is here where if the ACK check
+		// fails, we ignore the packet:
+		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591
+		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
+			r.ep.snd.sendAck()
+			return true, nil
+		}
+
 		// If we are closed for reads (either due to an
 		// incoming FIN or the user calling shutdown(..,
 		// SHUT_RD) then any data past the rcvNxt should
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index 2476998f8..de5b4be93 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -336,6 +336,15 @@ class PosixImpl final : public posix_server::Posix::Service {
     return ::grpc::Status::OK;
   }
 
+  ::grpc::Status Shutdown(grpc_impl::ServerContext *context,
+                          const ::posix_server::ShutdownRequest *request,
+                          ::posix_server::ShutdownResponse *response) override {
+    if (shutdown(request->fd(), request->how()) < 0) {
+      response->set_errno_(errno);
+    }
+    return ::grpc::Status::OK;
+  }
+
   ::grpc::Status Recv(::grpc::ServerContext *context,
                       const ::posix_server::RecvRequest *request,
                       ::posix_server::RecvResponse *response) override {
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index ccd20b10d..f32ed54ef 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -188,6 +188,15 @@ message SocketResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
+message ShutdownRequest {
+  int32 fd = 1;
+  int32 how = 2;
+}
+
+message ShutdownResponse {
+  int32 errno_ = 1;  // "errno" may fail to compile in c++.
+}
+
 message RecvRequest {
   int32 sockfd = 1;
   int32 len = 2;
@@ -225,6 +234,8 @@ service Posix {
   rpc SetSockOpt(SetSockOptRequest) returns (SetSockOptResponse);
   // Call socket() on the DUT.
   rpc Socket(SocketRequest) returns (SocketResponse);
+  // Call shutdown() on the DUT.
+  rpc Shutdown(ShutdownRequest) returns (ShutdownResponse);
   // Call recv() on the DUT.
   rpc Recv(RecvRequest) returns (RecvResponse);
 }
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 3af5f83fd..a90046f69 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -615,7 +615,7 @@ func (conn *Connection) ExpectFrame(t *testing.T, layers Layers, timeout time.Du
 			if errs == nil {
 				return nil, fmt.Errorf("got no frames matching %v during %s", layers, timeout)
 			}
-			return nil, fmt.Errorf("got no frames matching %v during %s: got %w", layers, timeout, errs)
+			return nil, fmt.Errorf("got frames %w want %v during %s", errs, layers, timeout)
 		}
 		if conn.match(layers, gotLayers) {
 			for i, s := range conn.layerStates {
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index 73c532e75..ff269d949 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -700,3 +700,29 @@ func (dut *DUT) RecvWithErrno(ctx context.Context, t *testing.T, sockfd, len, fl
 	}
 	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
 }
+
+// Shutdown calls shutdown on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// ShutdownWithErrno.
+func (dut *DUT) Shutdown(t *testing.T, fd, how int32) error {
+	t.Helper()
+
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	return dut.ShutdownWithErrno(ctx, t, fd, how)
+}
+
+// ShutdownWithErrno calls shutdown on the DUT.
+func (dut *DUT) ShutdownWithErrno(ctx context.Context, t *testing.T, fd, how int32) error {
+	t.Helper()
+
+	req := pb.ShutdownRequest{
+		Fd:  fd,
+		How: how,
+	}
+	resp, err := dut.posixServer.Shutdown(ctx, &req)
+	if err != nil {
+		t.Fatalf("failed to call Shutdown: %s", err)
+	}
+	return syscall.Errno(resp.GetErrno_())
+}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 74658fea0..e1ed0cc60 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -166,8 +166,8 @@ packetimpact_go_test(
 )
 
 packetimpact_go_test(
-    name = "tcp_close_wait_ack",
-    srcs = ["tcp_close_wait_ack_test.go"],
+    name = "tcp_unacc_seq_ack",
+    srcs = ["tcp_unacc_seq_ack_test.go"],
     deps = [
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
diff --git a/test/packetimpact/tests/tcp_close_wait_ack_test.go b/test/packetimpact/tests/tcp_close_wait_ack_test.go
deleted file mode 100644
index e6a96f214..000000000
--- a/test/packetimpact/tests/tcp_close_wait_ack_test.go
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcp_close_wait_ack_test
-
-import (
-	"flag"
-	"fmt"
-	"testing"
-	"time"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
-	"gvisor.dev/gvisor/test/packetimpact/testbench"
-)
-
-func init() {
-	testbench.RegisterFlags(flag.CommandLine)
-}
-
-func TestCloseWaitAck(t *testing.T) {
-	for _, tt := range []struct {
-		description    string
-		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
-		seqNumOffset   seqnum.Size
-		expectAck      bool
-	}{
-		{"OTW", generateOTWSeqSegment, 0, false},
-		{"OTW", generateOTWSeqSegment, 1, true},
-		{"OTW", generateOTWSeqSegment, 2, true},
-		{"ACK", generateUnaccACKSegment, 0, false},
-		{"ACK", generateUnaccACKSegment, 1, true},
-		{"ACK", generateUnaccACKSegment, 2, true},
-	} {
-		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
-			dut := testbench.NewDUT(t)
-			defer dut.TearDown()
-			listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
-			defer dut.Close(t, listenFd)
-			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
-			defer conn.Close(t)
-
-			conn.Connect(t)
-			acceptFd, _ := dut.Accept(t, listenFd)
-
-			// Send a FIN to DUT to intiate the active close
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
-			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
-			if err != nil {
-				t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err)
-			}
-			windowSize := seqnum.Size(*gotTCP.WindowSize)
-
-			// Send a segment with OTW Seq / unacc ACK and expect an ACK back
-			conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, windowSize), &testbench.Payload{Bytes: []byte("Sample Data")})
-			gotAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
-			if tt.expectAck && err != nil {
-				t.Fatalf("expected an ack but got none: %s", err)
-			}
-			if !tt.expectAck && gotAck != nil {
-				t.Fatalf("expected no ack but got one: %s", gotAck)
-			}
-
-			// Now let's verify DUT is indeed in CLOSE_WAIT
-			dut.Close(t, acceptFd)
-			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
-				t.Fatalf("expected DUT to send a FIN: %s", err)
-			}
-			// Ack the FIN from DUT
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-			// Send some extra data to DUT
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: []byte("Sample Data")})
-			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
-				t.Fatalf("expected DUT to send an RST: %s", err)
-			}
-		})
-	}
-}
-
-// generateOTWSeqSegment generates an segment with
-// seqnum = RCV.NXT + RCV.WND + seqNumOffset, the generated segment is only
-// acceptable when seqNumOffset is 0, otherwise an ACK is expected from the
-// receiver.
-func generateOTWSeqSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
-	lastAcceptable := conn.LocalSeqNum(t).Add(windowSize)
-	otwSeq := uint32(lastAcceptable.Add(seqNumOffset))
-	return testbench.TCP{SeqNum: testbench.Uint32(otwSeq), Flags: testbench.Uint8(header.TCPFlagAck)}
-}
-
-// generateUnaccACKSegment generates an segment with
-// acknum = SND.NXT + seqNumOffset, the generated segment is only acceptable
-// when seqNumOffset is 0, otherwise an ACK is expected from the receiver.
-func generateUnaccACKSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
-	lastAcceptable := conn.RemoteSeqNum(t)
-	unaccAck := uint32(lastAcceptable.Add(seqNumOffset))
-	return testbench.TCP{AckNum: testbench.Uint32(unaccAck), Flags: testbench.Uint8(header.TCPFlagAck)}
-}
diff --git a/test/packetimpact/tests/tcp_unacc_seq_ack_test.go b/test/packetimpact/tests/tcp_unacc_seq_ack_test.go
new file mode 100644
index 000000000..d078bbf15
--- /dev/null
+++ b/test/packetimpact/tests/tcp_unacc_seq_ack_test.go
@@ -0,0 +1,234 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_unacc_seq_ack_test
+
+import (
+	"flag"
+	"fmt"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestEstablishedUnaccSeqAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
+		seqNumOffset   seqnum.Size
+		expectAck      bool
+		restoreSeq     bool
+	}{
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 0, expectAck: true, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 1, expectAck: true, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 2, expectAck: true, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 0, expectAck: true, restoreSeq: false},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 1, expectAck: false, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 2, expectAck: false, restoreSeq: true},
+	} {
+		t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+			defer dut.Close(t, listenFD)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			dut.Accept(t, listenFD)
+
+			sampleData := []byte("Sample Data")
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if err != nil {
+				t.Fatalf("expected ack %s", err)
+			}
+			windowSize := seqnum.Size(*gotTCP.WindowSize)
+
+			origSeq := *conn.LocalSeqNum(t)
+			// Send a segment with OTW Seq / unacc ACK.
+			conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, windowSize), samplePayload)
+			if tt.restoreSeq {
+				// Restore the local sequence number to ensure that the incoming
+				// ACK matches the TCP layer state.
+				*conn.LocalSeqNum(t) = origSeq
+			}
+			gotAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if tt.expectAck && err != nil {
+				t.Fatalf("expected an ack but got none: %s", err)
+			}
+			if err == nil && !tt.expectAck && gotAck != nil {
+				t.Fatalf("expected no ack but got one: %s", gotAck)
+			}
+		})
+	}
+}
+
+func TestPassiveCloseUnaccSeqAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
+		seqNumOffset   seqnum.Size
+		expectAck      bool
+	}{
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 0, expectAck: false},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 1, expectAck: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 2, expectAck: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 0, expectAck: false},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 1, expectAck: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 2, expectAck: true},
+	} {
+		t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+			defer dut.Close(t, listenFD)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			acceptFD, _ := dut.Accept(t, listenFD)
+
+			// Send a FIN to DUT to intiate the passive close.
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
+			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if err != nil {
+				t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err)
+			}
+			windowSize := seqnum.Size(*gotTCP.WindowSize)
+
+			sampleData := []byte("Sample Data")
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+
+			// Send a segment with OTW Seq / unacc ACK.
+			conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, windowSize), samplePayload)
+			gotAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if tt.expectAck && err != nil {
+				t.Errorf("expected an ack but got none: %s", err)
+			}
+			if err == nil && !tt.expectAck && gotAck != nil {
+				t.Errorf("expected no ack but got one: %s", gotAck)
+			}
+
+			// Now let's verify DUT is indeed in CLOSE_WAIT
+			dut.Close(t, acceptFD)
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send a FIN: %s", err)
+			}
+			// Ack the FIN from DUT
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+			// Send some extra data to DUT
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, samplePayload)
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send an RST: %s", err)
+			}
+		})
+	}
+}
+
+func TestActiveCloseUnaccpSeqAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
+		seqNumOffset   seqnum.Size
+		restoreSeq     bool
+	}{
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 0, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 1, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 2, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 0, restoreSeq: false},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 1, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 2, restoreSeq: true},
+	} {
+		t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+			defer dut.Close(t, listenFD)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			acceptFD, _ := dut.Accept(t, listenFD)
+
+			// Trigger active close.
+			dut.Shutdown(t, acceptFD, syscall.SHUT_WR)
+
+			// Get to FIN_WAIT2
+			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second)
+			if err != nil {
+				t.Fatalf("expected a FIN: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+			sendUnaccSeqAck := func(state string) {
+				t.Helper()
+				sampleData := []byte("Sample Data")
+				samplePayload := &testbench.Payload{Bytes: sampleData}
+
+				origSeq := *conn.LocalSeqNum(t)
+				// Send a segment with OTW Seq / unacc ACK.
+				conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, seqnum.Size(*gotTCP.WindowSize)), samplePayload)
+				if tt.restoreSeq {
+					// Restore the local sequence number to ensure that the
+					// incoming ACK matches the TCP layer state.
+					*conn.LocalSeqNum(t) = origSeq
+				}
+				if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+					t.Errorf("expected an ack in %s state, but got none: %s", state, err)
+				}
+			}
+
+			sendUnaccSeqAck("FIN_WAIT2")
+
+			// Send a FIN to DUT to get to TIME_WAIT
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)})
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+				t.Fatalf("expected an ACK for our fin and DUT should enter TIME_WAIT: %s", err)
+			}
+
+			sendUnaccSeqAck("TIME_WAIT")
+		})
+	}
+}
+
+// generateOTWSeqSegment generates an segment with
+// seqnum = RCV.NXT + RCV.WND + seqNumOffset, the generated segment is only
+// acceptable when seqNumOffset is 0, otherwise an ACK is expected from the
+// receiver.
+func generateOTWSeqSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
+	lastAcceptable := conn.LocalSeqNum(t).Add(windowSize)
+	otwSeq := uint32(lastAcceptable.Add(seqNumOffset))
+	return testbench.TCP{SeqNum: testbench.Uint32(otwSeq), Flags: testbench.Uint8(header.TCPFlagAck)}
+}
+
+// generateUnaccACKSegment generates an segment with
+// acknum = SND.NXT + seqNumOffset, the generated segment is only acceptable
+// when seqNumOffset is 0, otherwise an ACK is expected from the receiver.
+func generateUnaccACKSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
+	lastAcceptable := conn.RemoteSeqNum(t)
+	unaccAck := uint32(lastAcceptable.Add(seqNumOffset))
+	return testbench.TCP{AckNum: testbench.Uint32(unaccAck), Flags: testbench.Uint8(header.TCPFlagAck)}
+}
-- 
cgit v1.2.3


From 8a8f457862e093f8d513b92769ebdf637929891a Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 1 Sep 2020 19:20:37 -0700
Subject: Implement setattr+clunk in 9P

This is to cover the common pattern: open->read/write->close,
where SetAttr needs to be called to update atime/mtime before
the file is closed.

Benchmark results:

BM_OpenReadClose/10240 CPU
setattr+clunk: 63783 ns
VFS2:          68109 ns
VFS1:          72507 ns

Updates #1198

PiperOrigin-RevId: 329628461
---
 pkg/p9/client_file.go                        |  38 ++++++-
 pkg/p9/file.go                               |  24 ++++
 pkg/p9/handlers.go                           |  31 +++++
 pkg/p9/messages.go                           |  60 ++++++++++
 pkg/p9/messages_test.go                      |  24 ++++
 pkg/p9/p9.go                                 | 162 ++++++++++++++-------------
 pkg/p9/p9test/client_test.go                 |  23 +++-
 pkg/p9/version.go                            |   8 +-
 pkg/sentry/fsimpl/gofer/gofer.go             |  40 ++++---
 pkg/sentry/fsimpl/gofer/p9file.go            |   7 ++
 runsc/fsgofer/fsgofer.go                     |   2 +
 test/perf/linux/BUILD                        |  16 +++
 test/perf/linux/open_read_close_benchmark.cc |  61 ++++++++++
 13 files changed, 389 insertions(+), 107 deletions(-)
 create mode 100644 test/perf/linux/open_read_close_benchmark.cc

(limited to 'pkg')

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 2ee07b664..28fe081d6 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -54,6 +54,8 @@ func (c *Client) newFile(fid FID) *clientFile {
 //
 // This proxies all of the interfaces found in file.go.
 type clientFile struct {
+	DisallowServerCalls
+
 	// client is the originating client.
 	client *Client
 
@@ -283,6 +285,39 @@ func (c *clientFile) Close() error {
 	return nil
 }
 
+// SetAttrClose implements File.SetAttrClose.
+func (c *clientFile) SetAttrClose(valid SetAttrMask, attr SetAttr) error {
+	if !versionSupportsTsetattrclunk(c.client.version) {
+		setAttrErr := c.SetAttr(valid, attr)
+
+		// Try to close file even in case of failure above. Since the state of the
+		// file is unknown to the caller, it will not attempt to close the file
+		// again.
+		if err := c.Close(); err != nil {
+			return err
+		}
+
+		return setAttrErr
+	}
+
+	// Avoid double close.
+	if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) {
+		return syscall.EBADF
+	}
+
+	// Send the message.
+	if err := c.client.sendRecv(&Tsetattrclunk{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattrclunk{}); err != nil {
+		// If an error occurred, we toss away the FID. This isn't ideal,
+		// but I'm not sure what else makes sense in this context.
+		log.Warningf("Tsetattrclunk failed, losing FID %v: %v", c.fid, err)
+		return err
+	}
+
+	// Return the FID to the pool.
+	c.client.fidPool.Put(uint64(c.fid))
+	return nil
+}
+
 // Open implements File.Open.
 func (c *clientFile) Open(flags OpenFlags) (*fd.FD, QID, uint32, error) {
 	if atomic.LoadUint32(&c.closed) != 0 {
@@ -681,6 +716,3 @@ func (c *clientFile) Flush() error {
 
 	return c.client.sendRecv(&Tflushf{FID: c.fid}, &Rflushf{})
 }
-
-// Renamed implements File.Renamed.
-func (c *clientFile) Renamed(newDir File, newName string) {}
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index cab35896f..c2e3a3f98 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -135,6 +135,14 @@ type File interface {
 	// On the server, Close has no concurrency guarantee.
 	Close() error
 
+	// SetAttrClose is the equivalent of calling SetAttr() followed by Close().
+	// This can be used to set file times before closing the file in a single
+	// operation.
+	//
+	// On the server, SetAttr has a write concurrency guarantee.
+	// On the server, Close has no concurrency guarantee.
+	SetAttrClose(valid SetAttrMask, attr SetAttr) error
+
 	// Open must be called prior to using Read, Write or Readdir. Once Open
 	// is called, some operations, such as Walk, will no longer work.
 	//
@@ -286,3 +294,19 @@ type DefaultWalkGetAttr struct{}
 func (DefaultWalkGetAttr) WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) {
 	return nil, nil, AttrMask{}, Attr{}, syscall.ENOSYS
 }
+
+// DisallowClientCalls panics if a client-only function is called.
+type DisallowClientCalls struct{}
+
+// SetAttrClose implements File.SetAttrClose.
+func (DisallowClientCalls) SetAttrClose(SetAttrMask, SetAttr) error {
+	panic("SetAttrClose should not be called on the server")
+}
+
+// DisallowServerCalls panics if a server-only function is called.
+type DisallowServerCalls struct{}
+
+// Renamed implements File.Renamed.
+func (*clientFile) Renamed(File, string) {
+	panic("Renamed should not be called on the client")
+}
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 1db5797dd..abd237f46 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -123,6 +123,37 @@ func (t *Tclunk) handle(cs *connState) message {
 	return &Rclunk{}
 }
 
+func (t *Tsetattrclunk) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	setAttrErr := ref.safelyWrite(func() error {
+		// We don't allow setattr on files that have been deleted.
+		// This might be technically incorrect, as it's possible that
+		// there were multiple links and you can still change the
+		// corresponding inode information.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+
+		// Set the attributes.
+		return ref.file.SetAttr(t.Valid, t.SetAttr)
+	})
+
+	// Try to delete FID even in case of failure above. Since the state of the
+	// file is unknown to the caller, it will not attempt to close the file again.
+	if !cs.DeleteFID(t.FID) {
+		return newErr(syscall.EBADF)
+	}
+	if setAttrErr != nil {
+		return newErr(setAttrErr)
+	}
+	return &Rsetattrclunk{}
+}
+
 // handle implements handler.handle.
 func (t *Tremove) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.FID)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index 2cb59f934..cf13cbb69 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -317,6 +317,64 @@ func (r *Rclunk) String() string {
 	return "Rclunk{}"
 }
 
+// Tsetattrclunk is a setattr+close request.
+type Tsetattrclunk struct {
+	// FID is the FID to change.
+	FID FID
+
+	// Valid is the set of bits which will be used.
+	Valid SetAttrMask
+
+	// SetAttr is the set request.
+	SetAttr SetAttr
+}
+
+// decode implements encoder.decode.
+func (t *Tsetattrclunk) decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Valid.decode(b)
+	t.SetAttr.decode(b)
+}
+
+// encode implements encoder.encode.
+func (t *Tsetattrclunk) encode(b *buffer) {
+	b.WriteFID(t.FID)
+	t.Valid.encode(b)
+	t.SetAttr.encode(b)
+}
+
+// Type implements message.Type.
+func (*Tsetattrclunk) Type() MsgType {
+	return MsgTsetattrclunk
+}
+
+// String implements fmt.Stringer.
+func (t *Tsetattrclunk) String() string {
+	return fmt.Sprintf("Tsetattrclunk{FID: %d, Valid: %v, SetAttr: %s}", t.FID, t.Valid, t.SetAttr)
+}
+
+// Rsetattrclunk is a setattr+close response.
+type Rsetattrclunk struct {
+}
+
+// decode implements encoder.decode.
+func (*Rsetattrclunk) decode(*buffer) {
+}
+
+// encode implements encoder.encode.
+func (*Rsetattrclunk) encode(*buffer) {
+}
+
+// Type implements message.Type.
+func (*Rsetattrclunk) Type() MsgType {
+	return MsgRsetattrclunk
+}
+
+// String implements fmt.Stringer.
+func (r *Rsetattrclunk) String() string {
+	return "Rsetattrclunk{}"
+}
+
 // Tremove is a remove request.
 //
 // This will eventually be replaced by Tunlinkat.
@@ -2657,6 +2715,8 @@ func init() {
 	msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} })
 	msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} })
 	msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} })
+	msgRegistry.register(MsgTsetattrclunk, func() message { return &Tsetattrclunk{} })
+	msgRegistry.register(MsgRsetattrclunk, func() message { return &Rsetattrclunk{} })
 	msgRegistry.register(MsgTchannel, func() message { return &Tchannel{} })
 	msgRegistry.register(MsgRchannel, func() message { return &Rchannel{} })
 }
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go
index 7facc9f5e..bfeb6c236 100644
--- a/pkg/p9/messages_test.go
+++ b/pkg/p9/messages_test.go
@@ -376,6 +376,30 @@ func TestEncodeDecode(t *testing.T) {
 		&Rumknod{
 			Rmknod{QID: QID{Type: 1}},
 		},
+		&Tsetattrclunk{
+			FID: 1,
+			Valid: SetAttrMask{
+				Permissions:        true,
+				UID:                true,
+				GID:                true,
+				Size:               true,
+				ATime:              true,
+				MTime:              true,
+				CTime:              true,
+				ATimeNotSystemTime: true,
+				MTimeNotSystemTime: true,
+			},
+			SetAttr: SetAttr{
+				Permissions:      1,
+				UID:              2,
+				GID:              3,
+				Size:             4,
+				ATimeSeconds:     5,
+				ATimeNanoSeconds: 6,
+				MTimeSeconds:     7,
+				MTimeNanoSeconds: 8,
+			},
+		},
 	}
 
 	for _, enc := range objs {
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 122c457d2..2235f8968 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -315,86 +315,88 @@ type MsgType uint8
 
 // MsgType declarations.
 const (
-	MsgTlerror      MsgType = 6
-	MsgRlerror              = 7
-	MsgTstatfs              = 8
-	MsgRstatfs              = 9
-	MsgTlopen               = 12
-	MsgRlopen               = 13
-	MsgTlcreate             = 14
-	MsgRlcreate             = 15
-	MsgTsymlink             = 16
-	MsgRsymlink             = 17
-	MsgTmknod               = 18
-	MsgRmknod               = 19
-	MsgTrename              = 20
-	MsgRrename              = 21
-	MsgTreadlink            = 22
-	MsgRreadlink            = 23
-	MsgTgetattr             = 24
-	MsgRgetattr             = 25
-	MsgTsetattr             = 26
-	MsgRsetattr             = 27
-	MsgTlistxattr           = 28
-	MsgRlistxattr           = 29
-	MsgTxattrwalk           = 30
-	MsgRxattrwalk           = 31
-	MsgTxattrcreate         = 32
-	MsgRxattrcreate         = 33
-	MsgTgetxattr            = 34
-	MsgRgetxattr            = 35
-	MsgTsetxattr            = 36
-	MsgRsetxattr            = 37
-	MsgTremovexattr         = 38
-	MsgRremovexattr         = 39
-	MsgTreaddir             = 40
-	MsgRreaddir             = 41
-	MsgTfsync               = 50
-	MsgRfsync               = 51
-	MsgTlink                = 70
-	MsgRlink                = 71
-	MsgTmkdir               = 72
-	MsgRmkdir               = 73
-	MsgTrenameat            = 74
-	MsgRrenameat            = 75
-	MsgTunlinkat            = 76
-	MsgRunlinkat            = 77
-	MsgTversion             = 100
-	MsgRversion             = 101
-	MsgTauth                = 102
-	MsgRauth                = 103
-	MsgTattach              = 104
-	MsgRattach              = 105
-	MsgTflush               = 108
-	MsgRflush               = 109
-	MsgTwalk                = 110
-	MsgRwalk                = 111
-	MsgTread                = 116
-	MsgRread                = 117
-	MsgTwrite               = 118
-	MsgRwrite               = 119
-	MsgTclunk               = 120
-	MsgRclunk               = 121
-	MsgTremove              = 122
-	MsgRremove              = 123
-	MsgTflushf              = 124
-	MsgRflushf              = 125
-	MsgTwalkgetattr         = 126
-	MsgRwalkgetattr         = 127
-	MsgTucreate             = 128
-	MsgRucreate             = 129
-	MsgTumkdir              = 130
-	MsgRumkdir              = 131
-	MsgTumknod              = 132
-	MsgRumknod              = 133
-	MsgTusymlink            = 134
-	MsgRusymlink            = 135
-	MsgTlconnect            = 136
-	MsgRlconnect            = 137
-	MsgTallocate            = 138
-	MsgRallocate            = 139
-	MsgTchannel             = 250
-	MsgRchannel             = 251
+	MsgTlerror       MsgType = 6
+	MsgRlerror       MsgType = 7
+	MsgTstatfs       MsgType = 8
+	MsgRstatfs       MsgType = 9
+	MsgTlopen        MsgType = 12
+	MsgRlopen        MsgType = 13
+	MsgTlcreate      MsgType = 14
+	MsgRlcreate      MsgType = 15
+	MsgTsymlink      MsgType = 16
+	MsgRsymlink      MsgType = 17
+	MsgTmknod        MsgType = 18
+	MsgRmknod        MsgType = 19
+	MsgTrename       MsgType = 20
+	MsgRrename       MsgType = 21
+	MsgTreadlink     MsgType = 22
+	MsgRreadlink     MsgType = 23
+	MsgTgetattr      MsgType = 24
+	MsgRgetattr      MsgType = 25
+	MsgTsetattr      MsgType = 26
+	MsgRsetattr      MsgType = 27
+	MsgTlistxattr    MsgType = 28
+	MsgRlistxattr    MsgType = 29
+	MsgTxattrwalk    MsgType = 30
+	MsgRxattrwalk    MsgType = 31
+	MsgTxattrcreate  MsgType = 32
+	MsgRxattrcreate  MsgType = 33
+	MsgTgetxattr     MsgType = 34
+	MsgRgetxattr     MsgType = 35
+	MsgTsetxattr     MsgType = 36
+	MsgRsetxattr     MsgType = 37
+	MsgTremovexattr  MsgType = 38
+	MsgRremovexattr  MsgType = 39
+	MsgTreaddir      MsgType = 40
+	MsgRreaddir      MsgType = 41
+	MsgTfsync        MsgType = 50
+	MsgRfsync        MsgType = 51
+	MsgTlink         MsgType = 70
+	MsgRlink         MsgType = 71
+	MsgTmkdir        MsgType = 72
+	MsgRmkdir        MsgType = 73
+	MsgTrenameat     MsgType = 74
+	MsgRrenameat     MsgType = 75
+	MsgTunlinkat     MsgType = 76
+	MsgRunlinkat     MsgType = 77
+	MsgTversion      MsgType = 100
+	MsgRversion      MsgType = 101
+	MsgTauth         MsgType = 102
+	MsgRauth         MsgType = 103
+	MsgTattach       MsgType = 104
+	MsgRattach       MsgType = 105
+	MsgTflush        MsgType = 108
+	MsgRflush        MsgType = 109
+	MsgTwalk         MsgType = 110
+	MsgRwalk         MsgType = 111
+	MsgTread         MsgType = 116
+	MsgRread         MsgType = 117
+	MsgTwrite        MsgType = 118
+	MsgRwrite        MsgType = 119
+	MsgTclunk        MsgType = 120
+	MsgRclunk        MsgType = 121
+	MsgTremove       MsgType = 122
+	MsgRremove       MsgType = 123
+	MsgTflushf       MsgType = 124
+	MsgRflushf       MsgType = 125
+	MsgTwalkgetattr  MsgType = 126
+	MsgRwalkgetattr  MsgType = 127
+	MsgTucreate      MsgType = 128
+	MsgRucreate      MsgType = 129
+	MsgTumkdir       MsgType = 130
+	MsgRumkdir       MsgType = 131
+	MsgTumknod       MsgType = 132
+	MsgRumknod       MsgType = 133
+	MsgTusymlink     MsgType = 134
+	MsgRusymlink     MsgType = 135
+	MsgTlconnect     MsgType = 136
+	MsgRlconnect     MsgType = 137
+	MsgTallocate     MsgType = 138
+	MsgRallocate     MsgType = 139
+	MsgTsetattrclunk MsgType = 140
+	MsgRsetattrclunk MsgType = 141
+	MsgTchannel      MsgType = 250
+	MsgRchannel      MsgType = 251
 )
 
 // QIDType represents the file type for QIDs.
diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go
index 6e7bb3db2..6e605b14c 100644
--- a/pkg/p9/p9test/client_test.go
+++ b/pkg/p9/p9test/client_test.go
@@ -1225,22 +1225,31 @@ func TestOpen(t *testing.T) {
 func TestClose(t *testing.T) {
 	type closeTest struct {
 		name    string
-		closeFn func(backend *Mock, f p9.File)
+		closeFn func(backend *Mock, f p9.File) error
 	}
 
 	cases := []closeTest{
 		{
 			name: "close",
-			closeFn: func(_ *Mock, f p9.File) {
-				f.Close()
+			closeFn: func(_ *Mock, f p9.File) error {
+				return f.Close()
 			},
 		},
 		{
 			name: "remove",
-			closeFn: func(backend *Mock, f p9.File) {
+			closeFn: func(backend *Mock, f p9.File) error {
 				// Allow the rename call in the parent, automatically translated.
 				backend.parent.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Times(1)
-				f.(deprecatedRemover).Remove()
+				return f.(deprecatedRemover).Remove()
+			},
+		},
+		{
+			name: "setAttrClose",
+			closeFn: func(backend *Mock, f p9.File) error {
+				valid := p9.SetAttrMask{ATime: true}
+				attr := p9.SetAttr{ATimeSeconds: 1, ATimeNanoSeconds: 2}
+				backend.EXPECT().SetAttr(valid, attr).Times(1)
+				return f.SetAttrClose(valid, attr)
 			},
 		},
 	}
@@ -1258,7 +1267,9 @@ func TestClose(t *testing.T) {
 				_, backend, f := walkHelper(h, name, root)
 
 				// Close via the prescribed method.
-				tc.closeFn(backend, f)
+				if err := tc.closeFn(backend, f); err != nil {
+					t.Fatalf("closeFn failed: %v", err)
+				}
 
 				// Everything should fail with EBADF.
 				if _, _, err := f.Walk(nil); err != syscall.EBADF {
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index 09cde9f5a..8d7168ef5 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 11
+	highestSupportedVersion uint32 = 12
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -173,3 +173,9 @@ func versionSupportsGetSetXattr(v uint32) bool {
 func versionSupportsListRemoveXattr(v uint32) bool {
 	return v >= 11
 }
+
+// versionSupportsTsetattrclunk returns true if version v supports
+// the Tsetattrclunk message.
+func versionSupportsTsetattrclunk(v uint32) bool {
+	return v >= 12
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 57bff1789..73d9e772d 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1300,30 +1300,36 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 	d.handleMu.Unlock()
 
 	if !d.file.isNil() {
+		valid := p9.SetAttrMask{}
+		attr := p9.SetAttr{}
 		if !d.isDeleted() {
 			// Write dirty timestamps back to the remote filesystem.
-			atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
-			mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
-			if atimeDirty || mtimeDirty {
+			if atomic.LoadUint32(&d.atimeDirty) != 0 {
+				valid.ATime = true
+				valid.ATimeNotSystemTime = true
 				atime := atomic.LoadInt64(&d.atime)
+				attr.ATimeSeconds = uint64(atime / 1e9)
+				attr.ATimeNanoSeconds = uint64(atime % 1e9)
+			}
+			if atomic.LoadUint32(&d.mtimeDirty) != 0 {
+				valid.MTime = true
+				valid.MTimeNotSystemTime = true
 				mtime := atomic.LoadInt64(&d.mtime)
-				if err := d.file.setAttr(ctx, p9.SetAttrMask{
-					ATime:              atimeDirty,
-					ATimeNotSystemTime: atimeDirty,
-					MTime:              mtimeDirty,
-					MTimeNotSystemTime: mtimeDirty,
-				}, p9.SetAttr{
-					ATimeSeconds:     uint64(atime / 1e9),
-					ATimeNanoSeconds: uint64(atime % 1e9),
-					MTimeSeconds:     uint64(mtime / 1e9),
-					MTimeNanoSeconds: uint64(mtime % 1e9),
-				}); err != nil {
-					log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
-				}
+				attr.MTimeSeconds = uint64(mtime / 1e9)
+				attr.MTimeNanoSeconds = uint64(mtime % 1e9)
+			}
+		}
+
+		// Check if attributes need to be changed before closing the file.
+		if valid.ATime || valid.MTime {
+			if err := d.file.setAttrClose(ctx, valid, attr); err != nil {
+				log.Warningf("gofer.dentry.destroyLocked: failed to close file with write dirty timestamps: %v", err)
 			}
+		} else if err := d.file.close(ctx); err != nil {
+			log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
 		}
-		d.file.close(ctx)
 		d.file = p9file{}
+
 		// Remove d from the set of syncable dentries.
 		d.fs.syncMu.Lock()
 		delete(d.fs.syncableDentries, d)
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 87f0b877f..21b4a96fe 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error {
 	return err
 }
 
+func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetAttrClose(valid, attr)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	ctx.UninterruptibleSleepStart(false)
 	fdobj, qid, iounit, err := f.file.Open(flags)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index b0788bd23..4268d97a1 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -181,6 +181,8 @@ func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID {
 // The few exceptions where it cannot be done are: utimensat on symlinks, and
 // Connect() for the socket address.
 type localFile struct {
+	p9.DisallowClientCalls
+
 	// attachPoint is the attachPoint that serves this localFile.
 	attachPoint *attachPoint
 
diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD
index b4e907826..dd1d2438c 100644
--- a/test/perf/linux/BUILD
+++ b/test/perf/linux/BUILD
@@ -354,3 +354,19 @@ cc_binary(
         "//test/util:test_util",
     ],
 )
+
+cc_binary(
+    name = "open_read_close_benchmark",
+    testonly = 1,
+    srcs = [
+        "open_read_close_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+    ],
+)
diff --git a/test/perf/linux/open_read_close_benchmark.cc b/test/perf/linux/open_read_close_benchmark.cc
new file mode 100644
index 000000000..8b023a3d8
--- /dev/null
+++ b/test/perf/linux/open_read_close_benchmark.cc
@@ -0,0 +1,61 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_OpenReadClose(benchmark::State& state) {
+  const int size = state.range(0);
+  std::vector<TempPath> cache;
+  for (int i = 0; i < size; i++) {
+    auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+        GetAbsoluteTestTmpdir(), "some content", 0644));
+    cache.emplace_back(std::move(path));
+  }
+
+  char buf[1];
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    const int chosen = rand_r(&seed) % size;
+    int fd = open(cache[chosen].path().c_str(), O_RDONLY);
+    TEST_CHECK(fd != -1);
+    TEST_CHECK(read(fd, buf, 1) == 1);
+    close(fd);
+  }
+}
+
+// Gofer dentry cache is 1000 by default. Go over it to force files to be closed
+// for real.
+BENCHMARK(BM_OpenReadClose)->Range(1000, 16384)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 095c3ad9b1c64b36578c2ead86b62dcbafc268e2 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 2 Sep 2020 11:35:18 -0700
Subject: Improve sync.SeqCount performance.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Make sync.SeqCountEpoch not a struct. This allows sync.SeqCount.BeginRead()
  to be inlined.

- Mark sync.SeqAtomicLoad<T> nosplit to mitigate the Go compiler's refusal to
  inline it. (Best I could get was "cost 92 exceeds budget 80".)

- Use runtime-guided spinning in SeqCount.BeginRead().

Benchmarks:
name                               old time/op  new time/op   delta
pkg:pkg/sync/sync goos:linux goarch:amd64
SeqCountWriteUncontended-12        8.24ns ± 0%  11.40ns ± 0%  +38.35%  (p=0.000 n=10+10)
SeqCountReadUncontended-12         0.33ns ± 0%   0.14ns ± 3%  -57.77%  (p=0.000 n=7+8)
pkg:pkg/sync/seqatomictest/seqatomic goos:linux goarch:amd64
SeqAtomicLoadIntUncontended-12     0.64ns ± 1%   0.41ns ± 1%  -36.40%  (p=0.000 n=10+8)
SeqAtomicTryLoadIntUncontended-12  0.18ns ± 4%   0.18ns ± 1%     ~     (p=0.206 n=10+8)
AtomicValueLoadIntUncontended-12   0.27ns ± 3%   0.27ns ± 0%   -1.77%  (p=0.000 n=10+8)

(atomic.Value.Load is, of course, inlined. We would expect an uncontended
inline SeqAtomicLoad<int> to perform identically to SeqAtomicTryLoad<int>.) The
"regression" in BenchmarkSeqCountWriteUncontended, despite this CL changing
nothing in that path, is attributed to microarchitectural subtlety; the
benchmark loop is unchanged except for its address:

Before this CL:
  :0                    0x4e62d1                48ffc2                  INCQ DX
  :0                    0x4e62d4                48399110010000          CMPQ DX, 0x110(CX)
  :0                    0x4e62db                7e26                    JLE 0x4e6303
  :0                    0x4e62dd                90                      NOPL
  :0                    0x4e62de                bb01000000              MOVL $0x1, BX
  :0                    0x4e62e3                f00fc118                LOCK XADDL BX, 0(AX)
  :0                    0x4e62e7                ffc3                    INCL BX
  :0                    0x4e62e9                0fbae300                BTL $0x0, BX
  :0                    0x4e62ed                733a                    JAE 0x4e6329
  :0                    0x4e62ef                90                      NOPL
  :0                    0x4e62f0                bb01000000              MOVL $0x1, BX
  :0                    0x4e62f5                f00fc118                LOCK XADDL BX, 0(AX)
  :0                    0x4e62f9                ffc3                    INCL BX
  :0                    0x4e62fb                0fbae300                BTL $0x0, BX
  :0                    0x4e62ff                73d0                    JAE 0x4e62d1

After this CL:
  :0                    0x4e6361                48ffc2                  INCQ DX
  :0                    0x4e6364                48399110010000          CMPQ DX, 0x110(CX)
  :0                    0x4e636b                7e26                    JLE 0x4e6393
  :0                    0x4e636d                90                      NOPL
  :0                    0x4e636e                bb01000000              MOVL $0x1, BX
  :0                    0x4e6373                f00fc118                LOCK XADDL BX, 0(AX)
  :0                    0x4e6377                ffc3                    INCL BX
  :0                    0x4e6379                0fbae300                BTL $0x0, BX
  :0                    0x4e637d                733a                    JAE 0x4e63b9
  :0                    0x4e637f                90                      NOPL
  :0                    0x4e6380                bb01000000              MOVL $0x1, BX
  :0                    0x4e6385                f00fc118                LOCK XADDL BX, 0(AX)
  :0                    0x4e6389                ffc3                    INCL BX
  :0                    0x4e638b                0fbae300                BTL $0x0, BX
  :0                    0x4e638f                73d0                    JAE 0x4e6361

PiperOrigin-RevId: 329754148
---
 pkg/sync/BUILD               |  1 +
 pkg/sync/seqatomic_unsafe.go | 40 +++++++++++++++++-----------------------
 pkg/sync/seqcount.go         | 30 ++++++++++++++++++++----------
 pkg/sync/spin_unsafe.go      | 24 ++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 33 deletions(-)
 create mode 100644 pkg/sync/spin_unsafe.go

(limited to 'pkg')

diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 4d47207f7..68535c3b1 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -38,6 +38,7 @@ go_library(
         "race_unsafe.go",
         "rwmutex_unsafe.go",
         "seqcount.go",
+        "spin_unsafe.go",
         "sync.go",
     ],
     marshal = False,
diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go
index eda6fb131..2184cb5ab 100644
--- a/pkg/sync/seqatomic_unsafe.go
+++ b/pkg/sync/seqatomic_unsafe.go
@@ -25,41 +25,35 @@ import (
 type Value struct{}
 
 // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
-// with any writer critical sections in sc.
-func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value {
-	// This function doesn't use SeqAtomicTryLoad because doing so is
-	// measurably, significantly (~20%) slower; Go is awful at inlining.
-	var val Value
+// with any writer critical sections in seq.
+//
+//go:nosplit
+func SeqAtomicLoad(seq *sync.SeqCount, ptr *Value) Value {
 	for {
-		epoch := sc.BeginRead()
-		if sync.RaceEnabled {
-			// runtime.RaceDisable() doesn't actually stop the race detector,
-			// so it can't help us here. Instead, call runtime.memmove
-			// directly, which is not instrumented by the race detector.
-			sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-		} else {
-			// This is ~40% faster for short reads than going through memmove.
-			val = *ptr
-		}
-		if sc.ReadOk(epoch) {
-			break
+		if val, ok := SeqAtomicTryLoad(seq, seq.BeginRead(), ptr); ok {
+			return val
 		}
 	}
-	return val
 }
 
 // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
-// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
-// would race with a writer critical section, SeqAtomicTryLoad returns
+// in seq initiated by a call to seq.BeginRead() that returned epoch. If the
+// read would race with a writer critical section, SeqAtomicTryLoad returns
 // (unspecified, false).
-func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) {
-	var val Value
+//
+//go:nosplit
+func SeqAtomicTryLoad(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (val Value, ok bool) {
 	if sync.RaceEnabled {
+		// runtime.RaceDisable() doesn't actually stop the race detector, so it
+		// can't help us here. Instead, call runtime.memmove directly, which is
+		// not instrumented by the race detector.
 		sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
 	} else {
+		// This is ~40% faster for short reads than going through memmove.
 		val = *ptr
 	}
-	return val, sc.ReadOk(epoch)
+	ok = seq.ReadOk(epoch)
+	return
 }
 
 func init() {
diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go
index a1e895352..2c5d3df99 100644
--- a/pkg/sync/seqcount.go
+++ b/pkg/sync/seqcount.go
@@ -8,7 +8,6 @@ package sync
 import (
 	"fmt"
 	"reflect"
-	"runtime"
 	"sync/atomic"
 )
 
@@ -43,9 +42,7 @@ type SeqCount struct {
 }
 
 // SeqCountEpoch tracks writer critical sections in a SeqCount.
-type SeqCountEpoch struct {
-	val uint32
-}
+type SeqCountEpoch uint32
 
 // We assume that:
 //
@@ -83,12 +80,25 @@ type SeqCountEpoch struct {
 // using this pattern. Most users of SeqCount will need to use the
 // SeqAtomicLoad function template in seqatomic.go.
 func (s *SeqCount) BeginRead() SeqCountEpoch {
-	epoch := atomic.LoadUint32(&s.epoch)
-	for epoch&1 != 0 {
-		runtime.Gosched()
-		epoch = atomic.LoadUint32(&s.epoch)
+	if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
+		return SeqCountEpoch(epoch)
+	}
+	return s.beginReadSlow()
+}
+
+func (s *SeqCount) beginReadSlow() SeqCountEpoch {
+	i := 0
+	for {
+		if canSpin(i) {
+			i++
+			doSpin()
+		} else {
+			goyield()
+		}
+		if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
+			return SeqCountEpoch(epoch)
+		}
 	}
-	return SeqCountEpoch{epoch}
 }
 
 // ReadOk returns true if the reader critical section initiated by a previous
@@ -99,7 +109,7 @@ func (s *SeqCount) BeginRead() SeqCountEpoch {
 // Reader critical sections do not need to be explicitly terminated; the last
 // call to ReadOk is implicitly the end of the reader critical section.
 func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
-	return atomic.LoadUint32(&s.epoch) == epoch.val
+	return atomic.LoadUint32(&s.epoch) == uint32(epoch)
 }
 
 // BeginWrite indicates the beginning of a writer critical section.
diff --git a/pkg/sync/spin_unsafe.go b/pkg/sync/spin_unsafe.go
new file mode 100644
index 000000000..f721449e3
--- /dev/null
+++ b/pkg/sync/spin_unsafe.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.16
+
+// Check go:linkname function signatures when updating Go version.
+
+package sync
+
+import (
+	_ "unsafe" // for go:linkname
+)
+
+//go:linkname canSpin sync.runtime_canSpin
+func canSpin(i int) bool
+
+//go:linkname doSpin sync.runtime_doSpin
+func doSpin()
+
+//go:linkname goyield runtime.goyield
+func goyield()
-- 
cgit v1.2.3


From 9b7f3ce38ac818dcb9edde7eb7288de665d3de10 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 2 Sep 2020 15:35:34 -0700
Subject: Update Go version constraint on sync/spin_unsafe.go.

PiperOrigin-RevId: 329801584
---
 pkg/sync/spin_unsafe.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/sync/spin_unsafe.go b/pkg/sync/spin_unsafe.go
index f721449e3..cafb2d065 100644
--- a/pkg/sync/spin_unsafe.go
+++ b/pkg/sync/spin_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
-- 
cgit v1.2.3


From 3e87c8e14d1552ec918256639797b07aa999bd0a Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Wed, 2 Sep 2020 15:39:51 -0700
Subject: [vfs] Fix error handling in overlayfs OpenAt.

Updates #1199

PiperOrigin-RevId: 329802274
---
 pkg/sentry/fsimpl/overlay/filesystem.go | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index e720bfb0b..63df86481 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -743,6 +743,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 
 	start := rp.Start().Impl().(*dentry)
 	if rp.Done() {
+		if mayCreate && rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
@@ -766,6 +769,10 @@ afterTrailingSymlink:
 	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
+	// Reject attempts to open directories with O_CREAT.
+	if mayCreate && rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
 	// Determine whether or not we need to create a file.
 	parent.dirMu.Lock()
 	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
@@ -774,12 +781,11 @@ afterTrailingSymlink:
 		parent.dirMu.Unlock()
 		return fd, err
 	}
+	parent.dirMu.Unlock()
 	if err != nil {
-		parent.dirMu.Unlock()
 		return nil, err
 	}
 	// Open existing child or follow symlink.
-	parent.dirMu.Unlock()
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
@@ -794,6 +800,9 @@ afterTrailingSymlink:
 		start = parent
 		goto afterTrailingSymlink
 	}
+	if rp.MustBeDir() && !child.isDir() {
+		return nil, syserror.ENOTDIR
+	}
 	if mayWrite {
 		if err := child.copyUpLocked(ctx); err != nil {
 			return nil, err
-- 
cgit v1.2.3


From f33077c875523d5f33edf8c395300e29e7c9e609 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Wed, 2 Sep 2020 17:56:30 -0700
Subject: [vfs] Implement xattr for overlayfs.

PiperOrigin-RevId: 329825497
---
 pkg/sentry/fs/inode_overlay.go          |   9 ++-
 pkg/sentry/fsimpl/gofer/gofer.go        |  42 +++++------
 pkg/sentry/fsimpl/overlay/copy_up.go    |  52 ++++++++++++--
 pkg/sentry/fsimpl/overlay/filesystem.go | 120 +++++++++++++++++++++++++++++---
 pkg/sentry/fsimpl/overlay/overlay.go    |  36 ++++++++++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go        |  46 +++---------
 pkg/sentry/vfs/permissions.go           |  38 ++++++++++
 7 files changed, 263 insertions(+), 80 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index dc2e353d9..0a2d64e3a 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -16,7 +16,6 @@ package fs
 
 import (
 	"fmt"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -539,7 +538,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
 
 	// Don't forward the value of the extended attribute if it would
 	// unexpectedly change the behavior of a wrapping overlay layer.
-	if strings.HasPrefix(XattrOverlayPrefix, name) {
+	if isXattrOverlay(name) {
 		return "", syserror.ENODATA
 	}
 
@@ -555,7 +554,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
 
 func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
 	// Don't allow changes to overlay xattrs through a setxattr syscall.
-	if strings.HasPrefix(XattrOverlayPrefix, name) {
+	if isXattrOverlay(name) {
 		return syserror.EPERM
 	}
 
@@ -578,7 +577,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
 	for name := range names {
 		// Same as overlayGetXattr, we shouldn't forward along
 		// overlay attributes.
-		if strings.HasPrefix(XattrOverlayPrefix, name) {
+		if isXattrOverlay(name) {
 			delete(names, name)
 		}
 	}
@@ -587,7 +586,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
 
 func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
 	// Don't allow changes to overlay xattrs through a removexattr syscall.
-	if strings.HasPrefix(XattrOverlayPrefix, name) {
+	if isXattrOverlay(name) {
 		return syserror.EPERM
 	}
 
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 73d9e772d..78b07f1b3 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1067,6 +1067,21 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+	// there is no need to expose any other xattrs through a gofer.
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
+	}
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
 	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
 }
@@ -1357,8 +1372,6 @@ func (d *dentry) setDeleted() {
 	atomic.StoreUint32(&d.deleted, 1)
 }
 
-// We only support xattrs prefixed with "user." (see b/148380782). Currently,
-// there is no need to expose any other xattrs through a gofer.
 func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
 	if d.file.isNil() || !d.userXattrSupported() {
 		return nil, nil
@@ -1369,6 +1382,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
 	}
 	xattrs := make([]string, 0, len(xattrMap))
 	for x := range xattrMap {
+		// We only support xattrs in the user.* namespace.
 		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
 			xattrs = append(xattrs, x)
 		}
@@ -1380,15 +1394,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	if d.file.isNil() {
 		return "", syserror.ENODATA
 	}
-	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return "", syserror.ENODATA
-	}
 	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
@@ -1396,15 +1404,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
-	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
@@ -1412,15 +1414,9 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
-	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return d.file.removeXattr(ctx, name)
 }
 
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index 13735eb05..ba7b8495a 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -91,6 +91,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		if err != nil {
 			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
 		}
+		if d.upperVD.Ok() {
+			d.upperVD.DecRef(ctx)
+			d.upperVD = vfs.VirtualDentry{}
+		}
 	}
 	switch ftype {
 	case linux.S_IFREG:
@@ -234,7 +238,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		panic(fmt.Sprintf("unexpected file type %o", ftype))
 	}
 
-	// TODO(gvisor.dev/issue/1199): copy up xattrs
+	if err := d.copyXattrsLocked(ctx); err != nil {
+		cleanupUndoCopyUp()
+		return err
+	}
 
 	// Update the dentry's device and inode numbers (except for directories,
 	// for which these remain overlay-assigned).
@@ -246,14 +253,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 			Mask: linux.STATX_INO,
 		})
 		if err != nil {
-			d.upperVD.DecRef(ctx)
-			d.upperVD = vfs.VirtualDentry{}
 			cleanupUndoCopyUp()
 			return err
 		}
 		if upperStat.Mask&linux.STATX_INO == 0 {
-			d.upperVD.DecRef(ctx)
-			d.upperVD = vfs.VirtualDentry{}
 			cleanupUndoCopyUp()
 			return syserror.EREMOTE
 		}
@@ -265,3 +268,42 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 	atomic.StoreUint32(&d.copiedUp, 1)
 	return nil
 }
+
+// copyXattrsLocked copies a subset of lower's extended attributes to upper.
+// Attributes that configure an overlay in the lower are not copied up.
+//
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) copyXattrsLocked(ctx context.Context) error {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]}
+	upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
+
+	lowerXattrs, err := vfsObj.ListxattrAt(ctx, d.fs.creds, lowerPop, 0)
+	if err != nil {
+		if err == syserror.EOPNOTSUPP {
+			// There are no guarantees as to the contents of lowerXattrs.
+			return nil
+		}
+		ctx.Warningf("failed to copy up xattrs because ListxattrAt failed: %v", err)
+		return err
+	}
+
+	for _, name := range lowerXattrs {
+		// Do not copy up overlay attributes.
+		if isOverlayXattr(name) {
+			continue
+		}
+
+		value, err := vfsObj.GetxattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetxattrOptions{Name: name, Size: 0})
+		if err != nil {
+			ctx.Warningf("failed to copy up xattrs because GetxattrAt failed: %v", err)
+			return err
+		}
+
+		if err := vfsObj.SetxattrAt(ctx, d.fs.creds, upperPop, &vfs.SetxattrOptions{Name: name, Value: value}); err != nil {
+			ctx.Warningf("failed to copy up xattrs because SetxattrAt failed: %v", err)
+			return err
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 63df86481..46528c99c 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -15,6 +15,7 @@
 package overlay
 
 import (
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -27,10 +28,15 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
+// attributes.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
+const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."
+
 // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
 // opaque directories.
 // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
-const _OVL_XATTR_OPAQUE = linux.XATTR_TRUSTED_PREFIX + "overlay.opaque"
+const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
 
 func isWhiteout(stat *linux.Statx) bool {
 	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
@@ -1347,18 +1353,42 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// isOverlayXattr returns whether the given extended attribute configures the
+// overlay.
+func isOverlayXattr(name string) bool {
+	return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
 func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return nil, err
 	}
-	// TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
-	// but not any other xattr syscalls. For now we just reject all of them.
-	return nil, syserror.ENOTSUP
+
+	return fs.listXattr(ctx, d, size)
+}
+
+func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	top := d.topLayer()
+	names, err := vfsObj.ListxattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
+	if err != nil {
+		return nil, err
+	}
+
+	// Filter out all overlay attributes.
+	n := 0
+	for _, name := range names {
+		if !isOverlayXattr(name) {
+			names[n] = name
+			n++
+		}
+	}
+	return names[:n], err
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
@@ -1366,11 +1396,29 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return "", err
 	}
-	return "", syserror.ENOTSUP
+
+	return fs.getXattr(ctx, d, rp.Credentials(), &opts)
+}
+
+func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
+		return "", err
+	}
+
+	// Return EOPNOTSUPP when fetching an overlay attribute.
+	// See fs/overlayfs/super.c:ovl_own_xattr_get().
+	if isOverlayXattr(opts.Name) {
+		return "", syserror.EOPNOTSUPP
+	}
+
+	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	top := d.topLayer()
+	return vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
 }
 
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
@@ -1378,11 +1426,36 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return err
 	}
-	return syserror.ENOTSUP
+
+	return fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
+		return err
+	}
+
+	// Return EOPNOTSUPP when setting an overlay attribute.
+	// See fs/overlayfs/super.c:ovl_own_xattr_set().
+	if isOverlayXattr(opts.Name) {
+		return syserror.EOPNOTSUPP
+	}
+
+	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	return vfsObj.SetxattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -1390,11 +1463,36 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return err
 	}
-	return syserror.ENOTSUP
+
+	return fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
+	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
+		return err
+	}
+
+	// Like SetxattrAt, return EOPNOTSUPP when removing an overlay attribute.
+	// Linux passes the remove request to xattr_handler->set.
+	// See fs/xattr.c:vfs_removexattr().
+	if isOverlayXattr(name) {
+		return syserror.EOPNOTSUPP
+	}
+
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	return vfsObj.RemovexattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 00562667f..e706f9d4e 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -570,6 +570,16 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
+	}
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
 // statInternalMask is the set of stat fields that is set by
 // dentry.statInternalTo().
 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
@@ -622,6 +632,32 @@ func (fd *fileDescription) dentry() *dentry {
 	return fd.vfsfd.Dentry().Impl().(*dentry)
 }
 
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	fs := fd.filesystem()
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	fs := fd.filesystem()
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	return fs.removeXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
+}
+
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
 	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index c4cec4130..d6074f20f 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -652,44 +652,18 @@ func (i *inode) removexattr(creds *auth.Credentials, name string) error {
 }
 
 func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
-	switch {
-	case ats&vfs.MayRead == vfs.MayRead:
-		if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
-			return err
-		}
-	case ats&vfs.MayWrite == vfs.MayWrite:
-		if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
-			return err
-		}
-	default:
-		panic(fmt.Sprintf("checkXattrPermissions called with impossible AccessTypes: %v", ats))
+	// We currently only support extended attributes in the user.* and
+	// trusted.* namespaces. See b/148380782.
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
+		return syserror.EOPNOTSUPP
 	}
-
-	switch {
-	case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
-		// The trusted.* namespace can only be accessed by privileged
-		// users.
-		if creds.HasCapability(linux.CAP_SYS_ADMIN) {
-			return nil
-		}
-		if ats&vfs.MayWrite == vfs.MayWrite {
-			return syserror.EPERM
-		}
-		return syserror.ENODATA
-	case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
-		// Extended attributes in the user.* namespace are only
-		// supported for regular files and directories.
-		filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
-		if filetype == linux.S_IFREG || filetype == linux.S_IFDIR {
-			return nil
-		}
-		if ats&vfs.MayWrite == vfs.MayWrite {
-			return syserror.EPERM
-		}
-		return syserror.ENODATA
-
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
 	}
-	return syserror.EOPNOTSUPP
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
 }
 
 // fileDescription is embedded by tmpfs implementations of
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 014b928ed..00eeb8842 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -16,6 +16,7 @@ package vfs
 
 import (
 	"math"
+	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -284,3 +285,40 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
 	}
 	return size, nil
 }
+
+// CheckXattrPermissions checks permissions for extended attribute access.
+// This is analogous to fs/xattr.c:xattr_permission(). Some key differences:
+// * Does not check for read-only filesystem property.
+// * Does not check inode immutability or append only mode. In both cases EPERM
+//   must be returned by filesystem implementations.
+// * Does not do inode permission checks. Filesystem implementations should
+//   handle inode permission checks as they may differ across implementations.
+func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error {
+	switch {
+	case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
+		// The trusted.* namespace can only be accessed by privileged
+		// users.
+		if creds.HasCapability(linux.CAP_SYS_ADMIN) {
+			return nil
+		}
+		if ats.MayWrite() {
+			return syserror.EPERM
+		}
+		return syserror.ENODATA
+	case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
+		// In the user.* namespace, only regular files and directories can have
+		// extended attributes. For sticky directories, only the owner and
+		// privileged users can write attributes.
+		filetype := mode.FileType()
+		if filetype != linux.ModeRegular && filetype != linux.ModeDirectory {
+			if ats.MayWrite() {
+				return syserror.EPERM
+			}
+			return syserror.ENODATA
+		}
+		if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) {
+			return syserror.EPERM
+		}
+	}
+	return nil
+}
-- 
cgit v1.2.3


From 89185098fae9cfe2725e7645aeb2ebd57aa26320 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 2 Sep 2020 18:19:50 -0700
Subject: Fix Accept to not return error for sockets in accept queue.

Accept on gVisor will return an error if a socket in the accept queue was closed
before Accept() was called. Linux will return the new fd even if the returned
socket is already closed by the peer say due to a RST being sent by the peer.

This seems to be intentional in linux more details on the github issue.

Fixes #3780

PiperOrigin-RevId: 329828404
---
 pkg/sentry/socket/netstack/netstack.go             | 22 +++--
 pkg/sentry/socket/netstack/netstack_vfs2.go        | 16 ++--
 pkg/sentry/socket/unix/transport/connectioned.go   | 14 +++-
 pkg/sentry/socket/unix/transport/connectionless.go |  4 +-
 pkg/sentry/socket/unix/transport/unix.go           |  5 +-
 pkg/sentry/socket/unix/unix.go                     | 22 +++--
 pkg/sentry/socket/unix/unix_vfs2.go                | 22 +++--
 pkg/tcpip/adapters/gonet/gonet.go                  |  4 +-
 pkg/tcpip/sample/tun_tcp_echo/main.go              |  2 +-
 pkg/tcpip/stack/transport_test.go                  |  4 +-
 pkg/tcpip/tcpip.go                                 |  5 +-
 pkg/tcpip/transport/icmp/endpoint.go               |  2 +-
 pkg/tcpip/transport/packet/endpoint.go             | 20 ++---
 pkg/tcpip/transport/raw/endpoint.go                |  8 +-
 pkg/tcpip/transport/tcp/dual_stack_test.go         | 30 +++----
 pkg/tcpip/transport/tcp/endpoint.go                | 13 ++-
 pkg/tcpip/transport/tcp/tcp_test.go                | 96 +++++++++++-----------
 pkg/tcpip/transport/tcp/testing/context/context.go |  4 +-
 pkg/tcpip/transport/udp/endpoint.go                |  2 +-
 test/syscalls/linux/socket_inet_loopback.cc        | 20 ++---
 20 files changed, 163 insertions(+), 152 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 36c17d1ba..91790834b 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -830,7 +830,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
 // connections are ready to be accept, it will block until one becomes ready.
-func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
+func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
 	// Register for notifications.
 	e, ch := waiter.NewChannelEntry(nil)
 	s.EventRegister(&e, waiter.EventIn)
@@ -839,7 +839,7 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
 	// Try to accept the connection again; if it fails, then wait until we
 	// get a notification.
 	for {
-		if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock {
+		if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock {
 			return ep, wq, syserr.TranslateNetstackError(err)
 		}
 
@@ -852,15 +852,18 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
 // Accept implements the linux syscall accept(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
-	// Issue the accept request to get the new endpoint.
-	ep, wq, terr := s.Endpoint.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, wq, terr := s.Endpoint.Accept(peerAddr)
 	if terr != nil {
 		if terr != tcpip.ErrWouldBlock || !blocking {
 			return 0, nil, 0, syserr.TranslateNetstackError(terr)
 		}
 
 		var err *syserr.Error
-		ep, wq, err = s.blockingAccept(t)
+		ep, wq, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -880,13 +883,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
-		// Get address of the peer and write it to peer slice.
-		var err *syserr.Error
-		addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+	if peerAddr != nil {
+		addr, addrLen = ConvertAddress(s.family, *peerAddr)
 	}
 
 	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index 1f7d17f5f..0f342e655 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -151,14 +151,18 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
 // tcpip.Endpoint.
 func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	// Issue the accept request to get the new endpoint.
-	ep, wq, terr := s.Endpoint.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, wq, terr := s.Endpoint.Accept(peerAddr)
 	if terr != nil {
 		if terr != tcpip.ErrWouldBlock || !blocking {
 			return 0, nil, 0, syserr.TranslateNetstackError(terr)
 		}
 
 		var err *syserr.Error
-		ep, wq, err = s.blockingAccept(t)
+		ep, wq, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -176,13 +180,9 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
+	if peerAddr != nil {
 		// Get address of the peer and write it to peer slice.
-		var err *syserr.Error
-		addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+		addr, addrLen = ConvertAddress(s.family, *peerAddr)
 	}
 
 	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index e3a75b519..aa4f3c04d 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -391,7 +391,7 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error {
 }
 
 // Accept accepts a new connection.
-func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) {
 	e.Lock()
 	defer e.Unlock()
 
@@ -401,6 +401,18 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
 
 	select {
 	case ne := <-e.acceptedChan:
+		if peerAddr != nil {
+			ne.Lock()
+			c := ne.connected
+			ne.Unlock()
+			if c != nil {
+				addr, err := c.GetLocalAddress()
+				if err != nil {
+					return nil, syserr.TranslateNetstackError(err)
+				}
+				*peerAddr = addr
+			}
+		}
 		return ne, nil
 
 	default:
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 4751b2fd8..f8aacca13 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -144,12 +144,12 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi
 }
 
 // Listen starts listening on the connection.
-func (e *connectionlessEndpoint) Listen(int) *syserr.Error {
+func (*connectionlessEndpoint) Listen(int) *syserr.Error {
 	return syserr.ErrNotSupported
 }
 
 // Accept accepts a new connection.
-func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (*connectionlessEndpoint) Accept(*tcpip.FullAddress) (Endpoint, *syserr.Error) {
 	return nil, syserr.ErrNotSupported
 }
 
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 1200cf9bb..cbbdd000f 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -151,7 +151,10 @@ type Endpoint interface {
 	// block if no new connections are available.
 	//
 	// The returned Queue is the wait queue for the newly created endpoint.
-	Accept() (Endpoint, *syserr.Error)
+	//
+	// peerAddr if not nil will be populated with the address of the connected
+	// peer on a successful accept.
+	Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error)
 
 	// Bind binds the endpoint to a specific local address and port.
 	// Specifying a NIC is optional.
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 0a7a26495..616530eb6 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -205,7 +205,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
 // connections are ready to be accept, it will block until one becomes ready.
-func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
 	// Register for notifications.
 	e, ch := waiter.NewChannelEntry(nil)
 	s.EventRegister(&e, waiter.EventIn)
@@ -214,7 +214,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
 	// Try to accept the connection; if it fails, then wait until we get a
 	// notification.
 	for {
-		if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+		if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
 			return ep, err
 		}
 
@@ -227,15 +227,18 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
 // Accept implements the linux syscall accept(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
-	// Issue the accept request to get the new endpoint.
-	ep, err := s.ep.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, err := s.ep.Accept(peerAddr)
 	if err != nil {
 		if err != syserr.ErrWouldBlock || !blocking {
 			return 0, nil, 0, err
 		}
 
 		var err *syserr.Error
-		ep, err = s.blockingAccept(t)
+		ep, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -252,13 +255,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
-		// Get address of the peer.
-		var err *syserr.Error
-		addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+	if peerAddr != nil {
+		addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
 	}
 
 	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 65a285b8f..e25c7e84a 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -96,7 +96,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
 // connections are ready to be accept, it will block until one becomes ready.
-func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketVFS2) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
 	// Register for notifications.
 	e, ch := waiter.NewChannelEntry(nil)
 	s.socketOpsCommon.EventRegister(&e, waiter.EventIn)
@@ -105,7 +105,7 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
 	// Try to accept the connection; if it fails, then wait until we get a
 	// notification.
 	for {
-		if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+		if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
 			return ep, err
 		}
 
@@ -118,15 +118,18 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
 // Accept implements the linux syscall accept(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
-	// Issue the accept request to get the new endpoint.
-	ep, err := s.ep.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, err := s.ep.Accept(peerAddr)
 	if err != nil {
 		if err != syserr.ErrWouldBlock || !blocking {
 			return 0, nil, 0, err
 		}
 
 		var err *syserr.Error
-		ep, err = s.blockingAccept(t)
+		ep, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -144,13 +147,8 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
-		// Get address of the peer.
-		var err *syserr.Error
-		addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+	if peerAddr != nil {
+		addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
 	}
 
 	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index 68a954a10..4f551cd92 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -245,7 +245,7 @@ func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn {
 
 // Accept implements net.Conn.Accept.
 func (l *TCPListener) Accept() (net.Conn, error) {
-	n, wq, err := l.ep.Accept()
+	n, wq, err := l.ep.Accept(nil)
 
 	if err == tcpip.ErrWouldBlock {
 		// Create wait queue entry that notifies a channel.
@@ -254,7 +254,7 @@ func (l *TCPListener) Accept() (net.Conn, error) {
 		defer l.wq.EventUnregister(&waitEntry)
 
 		for {
-			n, wq, err = l.ep.Accept()
+			n, wq, err = l.ep.Accept(nil)
 
 			if err != tcpip.ErrWouldBlock {
 				break
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go
index 9e37cab18..3f58a15ea 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/main.go
+++ b/pkg/tcpip/sample/tun_tcp_echo/main.go
@@ -188,7 +188,7 @@ func main() {
 	defer wq.EventUnregister(&waitEntry)
 
 	for {
-		n, wq, err := ep.Accept()
+		n, wq, err := ep.Accept(nil)
 		if err != nil {
 			if err == tcpip.ErrWouldBlock {
 				<-notifyCh
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index a1458c899..9292bfccb 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -180,7 +180,7 @@ func (*fakeTransportEndpoint) Listen(int) *tcpip.Error {
 	return nil
 }
 
-func (f *fakeTransportEndpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (f *fakeTransportEndpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	if len(f.acceptQueue) == 0 {
 		return nil, nil, nil
 	}
@@ -631,7 +631,7 @@ func TestTransportForwarding(t *testing.T) {
 		Data: req.ToVectorisedView(),
 	}))
 
-	aep, _, err := ep.Accept()
+	aep, _, err := ep.Accept(nil)
 	if err != nil || aep == nil {
 		t.Fatalf("Accept failed: %v, %v", aep, err)
 	}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index b113d8613..8ba615521 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -561,7 +561,10 @@ type Endpoint interface {
 	// block if no new connections are available.
 	//
 	// The returned Queue is the wait queue for the newly created endpoint.
-	Accept() (Endpoint, *waiter.Queue, *Error)
+	//
+	// If peerAddr is not nil then it is populated with the peer address of the
+	// returned endpoint.
+	Accept(peerAddr *FullAddress) (Endpoint, *waiter.Queue, *Error)
 
 	// Bind binds the endpoint to a specific local address and port.
 	// Specifying a NIC is optional.
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 346ca4bda..ad71ff3b6 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -597,7 +597,7 @@ func (*endpoint) Listen(int) *tcpip.Error {
 }
 
 // Accept is not supported by UDP, it just fails.
-func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 81093e9ca..8bd4e5e37 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -192,13 +192,13 @@ func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMes
 	return ep.ReadPacket(addr, nil)
 }
 
-func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+func (*endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
 	// TODO(gvisor.dev/issue/173): Implement.
 	return 0, nil, tcpip.ErrInvalidOptionValue
 }
 
 // Peek implements tcpip.Endpoint.Peek.
-func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+func (*endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 	return 0, tcpip.ControlMessages{}, nil
 }
 
@@ -210,25 +210,25 @@ func (*endpoint) Disconnect() *tcpip.Error {
 
 // Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be
 // connected, and this function always returnes tcpip.ErrNotSupported.
-func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+func (*endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used
 // with Shutdown, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+func (*endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with
 // Listen, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Listen(backlog int) *tcpip.Error {
+func (*endpoint) Listen(backlog int) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with
 // Accept, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
@@ -267,12 +267,12 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
-func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	return tcpip.FullAddress{}, tcpip.ErrNotSupported
 }
 
 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
-func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	// Even a connected socket doesn't return a remote address.
 	return tcpip.FullAddress{}, tcpip.ErrNotConnected
 }
@@ -371,7 +371,7 @@ func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
 }
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+func (*endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	return false, tcpip.ErrNotSupported
 }
 
@@ -508,7 +508,7 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress,
 }
 
 // State implements socket.Socket.State.
-func (ep *endpoint) State() uint32 {
+func (*endpoint) State() uint32 {
 	return 0
 }
 
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 71feeb748..fb03e6047 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -446,12 +446,12 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 }
 
 // Listen implements tcpip.Endpoint.Listen.
-func (e *endpoint) Listen(backlog int) *tcpip.Error {
+func (*endpoint) Listen(backlog int) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Accept implements tcpip.Endpoint.Accept.
-func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
@@ -482,12 +482,12 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
-func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	return tcpip.FullAddress{}, tcpip.ErrNotSupported
 }
 
 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
-func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	// Even a connected socket doesn't return a remote address.
 	return tcpip.FullAddress{}, tcpip.ErrNotConnected
 }
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 6074cc24e..80e9dd465 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -371,12 +371,12 @@ func testV4Accept(t *testing.T, c *context.Context) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 
-	nep, _, err := c.EP.Accept()
+	nep, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept()
+			nep, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
@@ -510,13 +510,13 @@ func TestV6AcceptOnV6(t *testing.T) {
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
-
-	nep, _, err := c.EP.Accept()
+	var addr tcpip.FullAddress
+	nep, _, err := c.EP.Accept(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept()
+			nep, _, err = c.EP.Accept(&addr)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
@@ -526,20 +526,14 @@ func TestV6AcceptOnV6(t *testing.T) {
 		}
 	}
 
+	if addr.Addr != context.TestV6Addr {
+		t.Errorf("Unexpected remote address: got %s, want %s", addr.Addr, context.TestV6Addr)
+	}
+
 	// Make sure we can still query the v6 only status of the new endpoint,
 	// that is, that it is in fact a v6 socket.
 	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil {
-		t.Fatalf("GetSockOpt failed failed: %v", err)
-	}
-
-	// Check the peer address.
-	addr, err := nep.GetRemoteAddress()
-	if err != nil {
-		t.Fatalf("GetRemoteAddress failed failed: %v", err)
-	}
-
-	if addr.Addr != context.TestV6Addr {
-		t.Fatalf("Unexpected remote address: got %v, want %v", addr.Addr, context.TestV6Addr)
+		t.Errorf("GetSockOptBool(tcpip.V6OnlyOption) failed: %s", err)
 	}
 }
 
@@ -610,12 +604,12 @@ func testV4ListenClose(t *testing.T, c *context.Context) {
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
-	nep, _, err := c.EP.Accept()
+	nep, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept()
+			nep, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 3f18efeef..4cf966b65 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2453,7 +2453,9 @@ func (e *endpoint) startAcceptedLoop() {
 
 // Accept returns a new endpoint if a peer has established a connection
 // to an endpoint previously set to listen mode.
-func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+//
+// addr if not-nil will contain the peer address of the returned endpoint.
+func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	e.LockUser()
 	defer e.UnlockUser()
 
@@ -2475,6 +2477,9 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
+	if peerAddr != nil {
+		*peerAddr = n.getRemoteAddress()
+	}
 	return n, n.waiterQueue, nil
 }
 
@@ -2577,11 +2582,15 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
+	return e.getRemoteAddress(), nil
+}
+
+func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
 	return tcpip.FullAddress{
 		Addr: e.ID.RemoteAddress,
 		Port: e.ID.RemotePort,
 		NIC:  e.boundNICID,
-	}, nil
+	}
 }
 
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index adb32e428..3d09d6def 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -291,12 +291,12 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2203,12 +2203,12 @@ func TestScaledWindowAccept(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2277,12 +2277,12 @@ func TestNonScaledWindowAccept(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2840,12 +2840,12 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2895,12 +2895,12 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5135,12 +5135,12 @@ func TestListenBacklogFull(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	for i := 0; i < listenBacklog; i++ {
-		_, _, err = c.EP.Accept()
+		_, _, err = c.EP.Accept(nil)
 		if err == tcpip.ErrWouldBlock {
 			// Wait for connection to be established.
 			select {
 			case <-ch:
-				_, _, err = c.EP.Accept()
+				_, _, err = c.EP.Accept(nil)
 				if err != nil {
 					t.Fatalf("Accept failed: %s", err)
 				}
@@ -5152,7 +5152,7 @@ func TestListenBacklogFull(t *testing.T) {
 	}
 
 	// Now verify that there are no more connections that can be accepted.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err != tcpip.ErrWouldBlock {
 		select {
 		case <-ch:
@@ -5164,12 +5164,12 @@ func TestListenBacklogFull(t *testing.T) {
 	// Now a new handshake must succeed.
 	executeHandshake(t, c, context.TestPort+2, false /*synCookieInUse */)
 
-	newEP, _, err := c.EP.Accept()
+	newEP, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			newEP, _, err = c.EP.Accept()
+			newEP, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5476,12 +5476,12 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 
-	newEP, _, err := c.EP.Accept()
+	newEP, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			newEP, _, err = c.EP.Accept()
+			newEP, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5552,12 +5552,12 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			_, _, err = c.EP.Accept()
+			_, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5568,7 +5568,7 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	}
 
 	// Now verify that there are no more connections that can be accepted.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err != tcpip.ErrWouldBlock {
 		select {
 		case <-ch:
@@ -5657,7 +5657,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
 		RcvWnd:  30000,
 	})
 
-	newEP, _, err := c.EP.Accept()
+	newEP, _, err := c.EP.Accept(nil)
 
 	if err != nil && err != tcpip.ErrWouldBlock {
 		t.Fatalf("Accept failed: %s", err)
@@ -5672,7 +5672,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			newEP, _, err = c.EP.Accept()
+			newEP, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5730,12 +5730,12 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	// Verify that there is only one acceptable connection at this point.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			_, _, err = c.EP.Accept()
+			_, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5800,12 +5800,12 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	// Now check that there is one acceptable connections.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			_, _, err = c.EP.Accept()
+			_, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5853,12 +5853,12 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	aep, _, err := ep.Accept()
+	aep, _, err := ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			aep, _, err = ep.Accept()
+			aep, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6293,12 +6293,12 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6412,12 +6412,12 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6519,12 +6519,12 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6602,12 +6602,12 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 	c.SendPacket(nil, ackHeaders)
 
 	// Try to accept the connection.
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6675,12 +6675,12 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6824,12 +6824,12 @@ func TestTCPCloseWithData(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -7271,8 +7271,8 @@ func TestTCPDeferAccept(t *testing.T) {
 
 	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
 
-	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock)
+	if _, _, err := c.EP.Accept(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: %s", err, tcpip.ErrWouldBlock)
 	}
 
 	// Send data. This should result in an acceptable endpoint.
@@ -7293,9 +7293,9 @@ func TestTCPDeferAccept(t *testing.T) {
 
 	// Give a bit of time for the socket to be delivered to the accept queue.
 	time.Sleep(50 * time.Millisecond)
-	aep, _, err := c.EP.Accept()
+	aep, _, err := c.EP.Accept(nil)
 	if err != nil {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err)
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: nil", err)
 	}
 
 	aep.Close()
@@ -7329,8 +7329,8 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 
 	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
 
-	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock)
+	if _, _, err := c.EP.Accept(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: %s", err, tcpip.ErrWouldBlock)
 	}
 
 	// Sleep for a little of the tcpDeferAccept timeout.
@@ -7362,9 +7362,9 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 
 	// Give sometime for the endpoint to be delivered to the accept queue.
 	time.Sleep(50 * time.Millisecond)
-	aep, _, err := c.EP.Accept()
+	aep, _, err := c.EP.Accept(nil)
 	if err != nil {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err)
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: nil", err)
 	}
 
 	aep.Close()
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 1f5340cd0..8bb5e5f6d 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -948,12 +948,12 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				c.t.Fatalf("Accept failed: %v", err)
 			}
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index c74bc4d94..2828b2c01 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1218,7 +1218,7 @@ func (*endpoint) Listen(int) *tcpip.Error {
 }
 
 // Accept is not supported by UDP, it just fails.
-func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index ffcd90475..54fee2e82 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1161,30 +1161,26 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) {
       SyscallSucceeds());
   ASSERT_THAT(close(conn_fd.release()), SyscallSucceeds());
 
-  // TODO(gvisor.dev/issue/3780): Remove this.
   if (IsRunningOnGvisor()) {
-    // Wait for the RST to be observed.
+    // Gvisor packet procssing is asynchronous and can take a bit of time in
+    // some cases so we give it a bit of time to process the RST packet before
+    // calling accept.
+    //
+    // There is nothing to poll() on so we have no choice but to use a sleep
+    // here.
     absl::SleepFor(absl::Milliseconds(100));
   }
 
   sockaddr_storage accept_addr;
   socklen_t addrlen = sizeof(accept_addr);
 
-  // TODO(gvisor.dev/issue/3780): Remove this.
-  if (IsRunningOnGvisor()) {
-    ASSERT_THAT(accept(listen_fd.get(),
-                       reinterpret_cast<sockaddr*>(&accept_addr), &addrlen),
-                SyscallFailsWithErrno(ENOTCONN));
-    return;
-  }
-
-  conn_fd = ASSERT_NO_ERRNO_AND_VALUE(Accept(
+  auto accept_fd = ASSERT_NO_ERRNO_AND_VALUE(Accept(
       listen_fd.get(), reinterpret_cast<sockaddr*>(&accept_addr), &addrlen));
   ASSERT_EQ(addrlen, listener.addr_len);
 
   int err;
   socklen_t optlen = sizeof(err);
-  ASSERT_THAT(getsockopt(conn_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
+  ASSERT_THAT(getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
               SyscallSucceeds());
   ASSERT_EQ(err, ECONNRESET);
   ASSERT_EQ(optlen, sizeof(err));
-- 
cgit v1.2.3


From 6ff4234587a7509db17262c7a64db17daee12806 Mon Sep 17 00:00:00 2001
From: Zeling Feng <zeling@google.com>
Date: Wed, 2 Sep 2020 19:17:32 -0700
Subject: Add support to run packetimpact tests against Fuchsia

blaze test <test_name>_fuchsia_test will run the corresponding packetimpact
test against fuchsia.

PiperOrigin-RevId: 329835290
---
 images/Makefile                               |   4 +-
 images/packetimpact/Dockerfile                |   8 +-
 pkg/test/dockerutil/container.go              |  12 +-
 test/packetimpact/dut/BUILD                   |  10 +
 test/packetimpact/dut/posix_server.cc         |   5 +-
 test/packetimpact/runner/BUILD                |  26 +-
 test/packetimpact/runner/defs.bzl             |   5 +-
 test/packetimpact/runner/dut.go               | 438 ++++++++++++++++++++++++++
 test/packetimpact/runner/packetimpact_test.go | 359 +--------------------
 9 files changed, 492 insertions(+), 375 deletions(-)
 create mode 100644 test/packetimpact/runner/dut.go

(limited to 'pkg')

diff --git a/images/Makefile b/images/Makefile
index 278dec02f..d183155a8 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -59,9 +59,9 @@ local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1))
 # we need to explicitly repull the base layer in order to ensure that the
 # architecture is correct. Note that we use the term "rebuild" here to avoid
 # conflicting with the bazel "build" terminology, which is used elsewhere.
-rebuild-%: FROM=$(shell grep FROM $(call path,$*)/Dockerfile } cut -d' ' -f2)
+rebuild-%: FROM=$(shell grep FROM $(call path,$*)/Dockerfile | cut -d' ' -f2)
 rebuild-%: register-cross
-	$(foreach IMAGE,$(FROM),docker $(DOCKER_PLATFORM_ARGS) $(IMAGE); &&) true
+	$(foreach IMAGE,$(FROM),docker pull $(DOCKER_PLATFORM_ARGS) $(IMAGE) &&) true && \
 	T=$$(mktemp -d) && cp -a $(call path,$*)/* $$T && \
 		docker build $(DOCKER_PLATFORM_ARGS) -t $(call remote_image,$*) $$T && \
 		rm -rf $$T
diff --git a/images/packetimpact/Dockerfile b/images/packetimpact/Dockerfile
index 87aa99ef2..82b7e8abd 100644
--- a/images/packetimpact/Dockerfile
+++ b/images/packetimpact/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:bionic
+FROM ubuntu:focal
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
         # iptables to disable OS native packet processing.
         iptables \
@@ -11,6 +11,10 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
         # tshark to log verbose packet sniffing.
         tshark \
         # killall for cleanup.
-        psmisc
+        psmisc \
+        # qemu-system-x86 to emulate fuchsia.
+        qemu-system-x86 \
+        # sha1sum to generate entropy.
+        libdigest-sha-perl
 RUN hash -r
 CMD /bin/bash
diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go
index 052b6b99d..64d17f661 100644
--- a/pkg/test/dockerutil/container.go
+++ b/pkg/test/dockerutil/container.go
@@ -22,6 +22,7 @@ import (
 	"net"
 	"os"
 	"path"
+	"path/filepath"
 	"regexp"
 	"strconv"
 	"strings"
@@ -403,10 +404,13 @@ func (c *Container) CopyFiles(opts *RunOpts, target string, sources ...string) {
 		return
 	}
 	for _, name := range sources {
-		src, err := testutil.FindFile(name)
-		if err != nil {
-			c.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err)
-			return
+		src := name
+		if !filepath.IsAbs(src) {
+			src, err = testutil.FindFile(name)
+			if err != nil {
+				c.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %w", name, err)
+				return
+			}
 		}
 		dst := path.Join(dir, path.Base(name))
 		if err := testutil.Copy(src, dst); err != nil {
diff --git a/test/packetimpact/dut/BUILD b/test/packetimpact/dut/BUILD
index 3ce63c2c6..ccf1c735f 100644
--- a/test/packetimpact/dut/BUILD
+++ b/test/packetimpact/dut/BUILD
@@ -16,3 +16,13 @@ cc_binary(
         "//test/packetimpact/proto:posix_server_cc_proto",
     ],
 )
+
+cc_binary(
+    name = "posix_server_dynamic",
+    srcs = ["posix_server.cc"],
+    deps = [
+        grpcpp,
+        "//test/packetimpact/proto:posix_server_cc_grpc_proto",
+        "//test/packetimpact/proto:posix_server_cc_proto",
+    ],
+)
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index de5b4be93..2f3becfba 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -21,6 +21,7 @@
 #include <string.h>
 #include <sys/socket.h>
 #include <sys/types.h>
+#include <time.h>
 #include <unistd.h>
 
 #include <iostream>
@@ -307,9 +308,9 @@ class PosixImpl final : public posix_server::Posix::Service {
         break;
       }
       case ::posix_server::SockOptVal::kTimeval: {
-        timeval tv = {.tv_sec = static_cast<__time_t>(
+        timeval tv = {.tv_sec = static_cast<time_t>(
                           request->optval().timeval().seconds()),
-                      .tv_usec = static_cast<__suseconds_t>(
+                      .tv_usec = static_cast<suseconds_t>(
                           request->optval().timeval().microseconds())};
         response->set_ret(setsockopt(request->sockfd(), request->level(),
                                      request->optname(), &tv, sizeof(tv)));
diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD
index ff2be9b30..605dd4972 100644
--- a/test/packetimpact/runner/BUILD
+++ b/test/packetimpact/runner/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "bzl_library", "go_test")
+load("//tools:defs.bzl", "bzl_library", "go_library", "go_test")
 
 package(
     default_visibility = ["//test/packetimpact:__subpackages__"],
@@ -7,21 +7,31 @@ package(
 
 go_test(
     name = "packetimpact_test",
-    srcs = ["packetimpact_test.go"],
+    srcs = [
+        "packetimpact_test.go",
+    ],
     tags = [
         # Not intended to be run directly.
         "local",
         "manual",
     ],
-    deps = [
-        "//pkg/test/dockerutil",
-        "//test/packetimpact/netdevs",
-        "@com_github_docker_docker//api/types/mount:go_default_library",
-    ],
+    deps = [":runner"],
 )
 
 bzl_library(
     name = "defs_bzl",
     srcs = ["defs.bzl"],
-    visibility = ["//visibility:private"],
+    visibility = ["//test/packetimpact:__subpackages__"],
+)
+
+go_library(
+    name = "runner",
+    testonly = True,
+    srcs = ["dut.go"],
+    visibility = ["//test/packetimpact:__subpackages__"],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//test/packetimpact/netdevs",
+        "@com_github_docker_docker//api/types/mount:go_default_library",
+    ],
 )
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index d72c63fe6..f56d3c42e 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -23,8 +23,9 @@ def _packetimpact_test_impl(ctx):
     transitive_files = []
     if hasattr(ctx.attr._test_runner, "data_runfiles"):
         transitive_files.append(ctx.attr._test_runner.data_runfiles.files)
+    files = [test_runner] + ctx.files.testbench_binary + ctx.files._posix_server
     runfiles = ctx.runfiles(
-        files = [test_runner] + ctx.files.testbench_binary + ctx.files._posix_server_binary,
+        files = files,
         transitive_files = depset(transitive = transitive_files),
         collect_default = True,
         collect_data = True,
@@ -38,7 +39,7 @@ _packetimpact_test = rule(
             cfg = "target",
             default = ":packetimpact_test",
         ),
-        "_posix_server_binary": attr.label(
+        "_posix_server": attr.label(
             cfg = "target",
             default = "//test/packetimpact/dut:posix_server",
         ),
diff --git a/test/packetimpact/runner/dut.go b/test/packetimpact/runner/dut.go
new file mode 100644
index 000000000..be7b52f18
--- /dev/null
+++ b/test/packetimpact/runner/dut.go
@@ -0,0 +1,438 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package runner starts docker containers and networking for a packetimpact test.
+package runner
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"net"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/docker/docker/api/types/mount"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/test/packetimpact/netdevs"
+)
+
+// stringList implements flag.Value.
+type stringList []string
+
+// String implements flag.Value.String.
+func (l *stringList) String() string {
+	return strings.Join(*l, ",")
+}
+
+// Set implements flag.Value.Set.
+func (l *stringList) Set(value string) error {
+	*l = append(*l, value)
+	return nil
+}
+
+var (
+	native          = false
+	testbenchBinary = ""
+	tshark          = false
+	extraTestArgs   = stringList{}
+	expectFailure   = false
+
+	// DutAddr is the IP addres for DUT.
+	DutAddr       = net.IPv4(0, 0, 0, 10)
+	testbenchAddr = net.IPv4(0, 0, 0, 20)
+)
+
+// RegisterFlags defines flags and associates them with the package-level
+// exported variables above. It should be called by tests in their init
+// functions.
+func RegisterFlags(fs *flag.FlagSet) {
+	fs.BoolVar(&native, "native", false, "whether the test should be run natively")
+	fs.StringVar(&testbenchBinary, "testbench_binary", "", "path to the testbench binary")
+	fs.BoolVar(&tshark, "tshark", false, "use more verbose tshark in logs instead of tcpdump")
+	flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench")
+	flag.BoolVar(&expectFailure, "expect_failure", false, "expect that the test will fail when run")
+}
+
+// CtrlPort is the port that posix_server listens on.
+const CtrlPort = "40000"
+
+// logger implements testutil.Logger.
+//
+// Labels logs based on their source and formats multi-line logs.
+type logger string
+
+// Name implements testutil.Logger.Name.
+func (l logger) Name() string {
+	return string(l)
+}
+
+// Logf implements testutil.Logger.Logf.
+func (l logger) Logf(format string, args ...interface{}) {
+	lines := strings.Split(fmt.Sprintf(format, args...), "\n")
+	log.Printf("%s: %s", l, lines[0])
+	for _, line := range lines[1:] {
+		log.Printf("%*s  %s", len(l), "", line)
+	}
+}
+
+// TestWithDUT runs a packetimpact test with the given information.
+func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Container) DUT, containerAddr net.IP) {
+	if testbenchBinary == "" {
+		t.Fatal("--testbench_binary is missing")
+	}
+	dockerutil.EnsureSupportedDockerVersion()
+
+	// Create the networks needed for the test. One control network is needed for
+	// the gRPC control packets and one test network on which to transmit the test
+	// packets.
+	ctrlNet := dockerutil.NewNetwork(ctx, logger("ctrlNet"))
+	testNet := dockerutil.NewNetwork(ctx, logger("testNet"))
+	for _, dn := range []*dockerutil.Network{ctrlNet, testNet} {
+		for {
+			if err := createDockerNetwork(ctx, dn); err != nil {
+				t.Log("creating docker network:", err)
+				const wait = 100 * time.Millisecond
+				t.Logf("sleeping %s and will try creating docker network again", wait)
+				// This can fail if another docker network claimed the same IP so we'll
+				// just try again.
+				time.Sleep(wait)
+				continue
+			}
+			break
+		}
+		dn := dn
+		t.Cleanup(func() {
+			if err := dn.Cleanup(ctx); err != nil {
+				t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
+			}
+		})
+		// Sanity check.
+		if inspect, err := dn.Inspect(ctx); err != nil {
+			t.Fatalf("failed to inspect network %s: %v", dn.Name, err)
+		} else if inspect.Name != dn.Name {
+			t.Fatalf("name mismatch for network want: %s got: %s", dn.Name, inspect.Name)
+		}
+	}
+
+	tmpDir, err := ioutil.TempDir("", "container-output")
+	if err != nil {
+		t.Fatal("creating temp dir:", err)
+	}
+	t.Cleanup(func() {
+		if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil {
+			t.Errorf("unable to copy container output files: %s", err)
+		}
+		if err := os.RemoveAll(tmpDir); err != nil {
+			t.Errorf("failed to remove tmpDir %s: %s", tmpDir, err)
+		}
+	})
+
+	const testOutputDir = "/tmp/testoutput"
+
+	// Create the Docker container for the DUT.
+	var dut *dockerutil.Container
+	if native {
+		dut = dockerutil.MakeNativeContainer(ctx, logger("dut"))
+	} else {
+		dut = dockerutil.MakeContainer(ctx, logger("dut"))
+	}
+	t.Cleanup(func() {
+		dut.CleanUp(ctx)
+	})
+
+	runOpts := dockerutil.RunOpts{
+		Image:  "packetimpact",
+		CapAdd: []string{"NET_ADMIN"},
+		Mounts: []mount.Mount{{
+			Type:     mount.TypeBind,
+			Source:   tmpDir,
+			Target:   testOutputDir,
+			ReadOnly: false,
+		}},
+	}
+
+	// Add ctrlNet as eth1 and testNet as eth2.
+	const testNetDev = "eth2"
+
+	device := mkDevice(dut)
+	remoteIPv6, remoteMAC, dutDeviceID := device.Prepare(ctx, t, runOpts, ctrlNet, testNet, containerAddr)
+
+	// Create the Docker container for the testbench.
+	testbench := dockerutil.MakeNativeContainer(ctx, logger("testbench"))
+
+	tbb := path.Base(testbenchBinary)
+	containerTestbenchBinary := filepath.Join("/packetimpact", tbb)
+	testbench.CopyFiles(&runOpts, "/packetimpact", filepath.Join("test/packetimpact/tests", tbb))
+
+	// Run tcpdump in the test bench unbuffered, without DNS resolution, just on
+	// the interface with the test packets.
+	snifferArgs := []string{
+		"tcpdump",
+		"-S", "-vvv", "-U", "-n",
+		"-i", testNetDev,
+		"-w", testOutputDir + "/dump.pcap",
+	}
+	snifferRegex := "tcpdump: listening.*\n"
+	if tshark {
+		// Run tshark in the test bench unbuffered, without DNS resolution, just on
+		// the interface with the test packets.
+		snifferArgs = []string{
+			"tshark", "-V", "-l", "-n", "-i", testNetDev,
+			"-o", "tcp.check_checksum:TRUE",
+			"-o", "udp.check_checksum:TRUE",
+		}
+		snifferRegex = "Capturing on.*\n"
+	}
+
+	if err := StartContainer(
+		ctx,
+		runOpts,
+		testbench,
+		testbenchAddr,
+		[]*dockerutil.Network{ctrlNet, testNet},
+		snifferArgs...,
+	); err != nil {
+		t.Fatalf("failed to start docker container for testbench sniffer: %s", err)
+	}
+	// Kill so that it will flush output.
+	t.Cleanup(func() {
+		time.Sleep(1 * time.Second)
+		testbench.Exec(ctx, dockerutil.ExecOpts{}, "killall", snifferArgs[0])
+	})
+
+	if _, err := testbench.WaitForOutput(ctx, snifferRegex, 60*time.Second); err != nil {
+		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
+	}
+
+	// When the Linux kernel receives a SYN-ACK for a SYN it didn't send, it
+	// will respond with an RST. In most packetimpact tests, the SYN is sent
+	// by the raw socket and the kernel knows nothing about the connection, this
+	// behavior will break lots of TCP related packetimpact tests. To prevent
+	// this, we can install the following iptables rules. The raw socket that
+	// packetimpact tests use will still be able to see everything.
+	for _, bin := range []string{"iptables", "ip6tables"} {
+		if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, bin, "-A", "INPUT", "-i", testNetDev, "-p", "tcp", "-j", "DROP"); err != nil {
+			t.Fatalf("unable to Exec %s on container %s: %s, logs from testbench:\n%s", bin, testbench.Name, err, logs)
+		}
+	}
+
+	// FIXME(b/156449515): Some piece of the system has a race. The old
+	// bash script version had a sleep, so we have one too. The race should
+	// be fixed and this sleep removed.
+	time.Sleep(time.Second)
+
+	// Start a packetimpact test on the test bench. The packetimpact test sends
+	// and receives packets and also sends POSIX socket commands to the
+	// posix_server to be executed on the DUT.
+	testArgs := []string{containerTestbenchBinary}
+	testArgs = append(testArgs, extraTestArgs...)
+	testArgs = append(testArgs,
+		"--posix_server_ip", AddressInSubnet(DutAddr, *ctrlNet.Subnet).String(),
+		"--posix_server_port", CtrlPort,
+		"--remote_ipv4", AddressInSubnet(DutAddr, *testNet.Subnet).String(),
+		"--local_ipv4", AddressInSubnet(testbenchAddr, *testNet.Subnet).String(),
+		"--remote_ipv6", remoteIPv6.String(),
+		"--remote_mac", remoteMAC.String(),
+		"--remote_interface_id", fmt.Sprintf("%d", dutDeviceID),
+		"--device", testNetDev,
+		fmt.Sprintf("--native=%t", native),
+	)
+	testbenchLogs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, testArgs...)
+	if (err != nil) != expectFailure {
+		var dutLogs string
+		if logs, err := device.Logs(ctx); err != nil {
+			dutLogs = fmt.Sprintf("failed to fetch DUT logs: %s", err)
+		} else {
+			dutLogs = logs
+		}
+
+		t.Errorf(`test error: %v, expect failure: %t
+
+%s
+
+====== Begin of Testbench Logs ======
+
+%s
+
+====== End of Testbench Logs ======`,
+			err, expectFailure, dutLogs, testbenchLogs)
+	}
+}
+
+// DUT describes how to setup/teardown the dut for packetimpact tests.
+type DUT interface {
+	// Prepare prepares the dut, starts posix_server and returns the IPv6, MAC
+	// address and the interface ID for the testNet on DUT.
+	Prepare(ctx context.Context, t *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network, containerAddr net.IP) (net.IP, net.HardwareAddr, uint32)
+	// Logs retrieves the logs from the dut.
+	Logs(ctx context.Context) (string, error)
+}
+
+// DockerDUT describes a docker based DUT.
+type DockerDUT struct {
+	c *dockerutil.Container
+}
+
+// NewDockerDUT creates a docker based DUT.
+func NewDockerDUT(c *dockerutil.Container) DUT {
+	return &DockerDUT{
+		c: c,
+	}
+}
+
+// Prepare implements DUT.Prepare.
+func (dut *DockerDUT) Prepare(ctx context.Context, t *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network, containerAddr net.IP) (net.IP, net.HardwareAddr, uint32) {
+	const containerPosixServerBinary = "/packetimpact/posix_server"
+	dut.c.CopyFiles(&runOpts, "/packetimpact", "test/packetimpact/dut/posix_server")
+
+	if err := StartContainer(
+		ctx,
+		runOpts,
+		dut.c,
+		containerAddr,
+		[]*dockerutil.Network{ctrlNet, testNet},
+		containerPosixServerBinary,
+		"--ip=0.0.0.0",
+		"--port="+CtrlPort,
+	); err != nil {
+		t.Fatalf("failed to start docker container for DUT: %s", err)
+	}
+
+	if _, err := dut.c.WaitForOutput(ctx, "Server listening.*\n", 60*time.Second); err != nil {
+		t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.c.Name, err)
+	}
+
+	dutTestDevice, dutDeviceInfo, err := deviceByIP(ctx, dut.c, AddressInSubnet(containerAddr, *testNet.Subnet))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	remoteMAC := dutDeviceInfo.MAC
+	remoteIPv6 := dutDeviceInfo.IPv6Addr
+	// Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
+	// needed.
+	if remoteIPv6 == nil {
+		if _, err := dut.c.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
+			t.Fatalf("unable to ip addr add on container %s: %s", dut.c.Name, err)
+		}
+		// Now try again, to make sure that it worked.
+		_, dutDeviceInfo, err = deviceByIP(ctx, dut.c, AddressInSubnet(containerAddr, *testNet.Subnet))
+		if err != nil {
+			t.Fatal(err)
+		}
+		remoteIPv6 = dutDeviceInfo.IPv6Addr
+		if remoteIPv6 == nil {
+			t.Fatalf("unable to set IPv6 address on container %s", dut.c.Name)
+		}
+	}
+	return remoteIPv6, dutDeviceInfo.MAC, dutDeviceInfo.ID
+}
+
+// Logs implements DUT.Logs.
+func (dut *DockerDUT) Logs(ctx context.Context) (string, error) {
+	logs, err := dut.c.Logs(ctx)
+	if err != nil {
+		return "", err
+	}
+	return fmt.Sprintf(`====== Begin of DUT Logs ======
+
+%s
+
+====== End of DUT Logs ======`, logs), nil
+}
+
+// AddNetworks connects docker network with the container and assigns the specific IP.
+func AddNetworks(ctx context.Context, d *dockerutil.Container, addr net.IP, networks []*dockerutil.Network) error {
+	for _, dn := range networks {
+		ip := AddressInSubnet(addr, *dn.Subnet)
+		// Connect to the network with the specified IP address.
+		if err := dn.Connect(ctx, d, ip.String(), ""); err != nil {
+			return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
+		}
+	}
+	return nil
+}
+
+// AddressInSubnet combines the subnet provided with the address and returns a
+// new address. The return address bits come from the subnet where the mask is 1
+// and from the ip address where the mask is 0.
+func AddressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
+	var octets []byte
+	for i := 0; i < 4; i++ {
+		octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i])))
+	}
+	return net.IP(octets)
+}
+
+// deviceByIP finds a deviceInfo and device name from an IP address.
+func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string, netdevs.DeviceInfo, error) {
+	out, err := d.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "show")
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w\n%s", d.Name, err, out)
+	}
+	devs, err := netdevs.ParseDevices(out)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w\n%s", d.Name, err, out)
+	}
+	testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err)
+	}
+	return testDevice, deviceInfo, nil
+}
+
+// createDockerNetwork makes a randomly-named network that will start with the
+// namePrefix. The network will be a random /24 subnet.
+func createDockerNetwork(ctx context.Context, n *dockerutil.Network) error {
+	randSource := rand.NewSource(time.Now().UnixNano())
+	r1 := rand.New(randSource)
+	// Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+	ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0)
+	n.Subnet = &net.IPNet{
+		IP:   ip,
+		Mask: ip.DefaultMask(),
+	}
+	return n.Create(ctx)
+}
+
+// StartContainer will create a container instance from runOpts, connect it
+// with the specified docker networks and start executing the specified cmd.
+func StartContainer(ctx context.Context, runOpts dockerutil.RunOpts, c *dockerutil.Container, containerAddr net.IP, ns []*dockerutil.Network, cmd ...string) error {
+	conf, hostconf, netconf := c.ConfigsFrom(runOpts, cmd...)
+	_ = netconf
+	hostconf.AutoRemove = true
+	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
+
+	if err := c.CreateFrom(ctx, conf, hostconf, nil); err != nil {
+		return fmt.Errorf("unable to create container %s: %w", c.Name, err)
+	}
+
+	if err := AddNetworks(ctx, c, containerAddr, ns); err != nil {
+		return fmt.Errorf("unable to connect the container with the networks: %w", err)
+	}
+
+	if err := c.Start(ctx); err != nil {
+		return fmt.Errorf("unable to start container %s: %w", c.Name, err)
+	}
+	return nil
+}
diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go
index cb9bfd5b7..c598bfc29 100644
--- a/test/packetimpact/runner/packetimpact_test.go
+++ b/test/packetimpact/runner/packetimpact_test.go
@@ -18,366 +18,15 @@ package packetimpact_test
 import (
 	"context"
 	"flag"
-	"fmt"
-	"io/ioutil"
-	"log"
-	"math/rand"
-	"net"
-	"os"
-	"os/exec"
-	"path"
-	"strings"
 	"testing"
-	"time"
 
-	"github.com/docker/docker/api/types/mount"
-	"gvisor.dev/gvisor/pkg/test/dockerutil"
-	"gvisor.dev/gvisor/test/packetimpact/netdevs"
+	"gvisor.dev/gvisor/test/packetimpact/runner"
 )
 
-// stringList implements flag.Value.
-type stringList []string
-
-// String implements flag.Value.String.
-func (l *stringList) String() string {
-	return strings.Join(*l, ",")
-}
-
-// Set implements flag.Value.Set.
-func (l *stringList) Set(value string) error {
-	*l = append(*l, value)
-	return nil
-}
-
-var (
-	native          = flag.Bool("native", false, "whether the test should be run natively")
-	testbenchBinary = flag.String("testbench_binary", "", "path to the testbench binary")
-	tshark          = flag.Bool("tshark", false, "use more verbose tshark in logs instead of tcpdump")
-	extraTestArgs   = stringList{}
-	expectFailure   = flag.Bool("expect_failure", false, "expect that the test will fail when run")
-
-	dutAddr       = net.IPv4(0, 0, 0, 10)
-	testbenchAddr = net.IPv4(0, 0, 0, 20)
-)
-
-const ctrlPort = "40000"
-
-// logger implements testutil.Logger.
-//
-// Labels logs based on their source and formats multi-line logs.
-type logger string
-
-// Name implements testutil.Logger.Name.
-func (l logger) Name() string {
-	return string(l)
-}
-
-// Logf implements testutil.Logger.Logf.
-func (l logger) Logf(format string, args ...interface{}) {
-	lines := strings.Split(fmt.Sprintf(format, args...), "\n")
-	log.Printf("%s: %s", l, lines[0])
-	for _, line := range lines[1:] {
-		log.Printf("%*s  %s", len(l), "", line)
-	}
+func init() {
+	runner.RegisterFlags(flag.CommandLine)
 }
 
 func TestOne(t *testing.T) {
-	flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench")
-	flag.Parse()
-	if *testbenchBinary == "" {
-		t.Fatal("--testbench_binary is missing")
-	}
-	dockerutil.EnsureSupportedDockerVersion()
-	ctx := context.Background()
-
-	// Create the networks needed for the test. One control network is needed for
-	// the gRPC control packets and one test network on which to transmit the test
-	// packets.
-	ctrlNet := dockerutil.NewNetwork(ctx, logger("ctrlNet"))
-	testNet := dockerutil.NewNetwork(ctx, logger("testNet"))
-	for _, dn := range []*dockerutil.Network{ctrlNet, testNet} {
-		for {
-			if err := createDockerNetwork(ctx, dn); err != nil {
-				t.Log("creating docker network:", err)
-				const wait = 100 * time.Millisecond
-				t.Logf("sleeping %s and will try creating docker network again", wait)
-				// This can fail if another docker network claimed the same IP so we'll
-				// just try again.
-				time.Sleep(wait)
-				continue
-			}
-			break
-		}
-		defer func(dn *dockerutil.Network) {
-			if err := dn.Cleanup(ctx); err != nil {
-				t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
-			}
-		}(dn)
-		// Sanity check.
-		inspect, err := dn.Inspect(ctx)
-		if err != nil {
-			t.Fatalf("failed to inspect network %s: %v", dn.Name, err)
-		} else if inspect.Name != dn.Name {
-			t.Fatalf("name mismatch for network want: %s got: %s", dn.Name, inspect.Name)
-		}
-
-	}
-
-	tmpDir, err := ioutil.TempDir("", "container-output")
-	if err != nil {
-		t.Fatal("creating temp dir:", err)
-	}
-	defer os.RemoveAll(tmpDir)
-
-	const testOutputDir = "/tmp/testoutput"
-
-	// Create the Docker container for the DUT.
-	var dut *dockerutil.Container
-	if *native {
-		dut = dockerutil.MakeNativeContainer(ctx, logger("dut"))
-	} else {
-		dut = dockerutil.MakeContainer(ctx, logger("dut"))
-	}
-
-	runOpts := dockerutil.RunOpts{
-		Image:  "packetimpact",
-		CapAdd: []string{"NET_ADMIN"},
-		Mounts: []mount.Mount{mount.Mount{
-			Type:     mount.TypeBind,
-			Source:   tmpDir,
-			Target:   testOutputDir,
-			ReadOnly: false,
-		}},
-	}
-
-	const containerPosixServerBinary = "/packetimpact/posix_server"
-	dut.CopyFiles(&runOpts, "/packetimpact", "/test/packetimpact/dut/posix_server")
-
-	conf, hostconf, _ := dut.ConfigsFrom(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort)
-	hostconf.AutoRemove = true
-	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
-
-	if err := dut.CreateFrom(ctx, conf, hostconf, nil); err != nil {
-		t.Fatalf("unable to create container %s: %v", dut.Name, err)
-	}
-
-	defer dut.CleanUp(ctx)
-
-	// Add ctrlNet as eth1 and testNet as eth2.
-	const testNetDev = "eth2"
-	if err := addNetworks(ctx, dut, dutAddr, []*dockerutil.Network{ctrlNet, testNet}); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := dut.Start(ctx); err != nil {
-		t.Fatalf("unable to start container %s: %s", dut.Name, err)
-	}
-
-	if _, err := dut.WaitForOutput(ctx, "Server listening.*\n", 60*time.Second); err != nil {
-		t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.Name, err)
-	}
-
-	dutTestDevice, dutDeviceInfo, err := deviceByIP(ctx, dut, addressInSubnet(dutAddr, *testNet.Subnet))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	remoteMAC := dutDeviceInfo.MAC
-	remoteIPv6 := dutDeviceInfo.IPv6Addr
-	// Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
-	// needed.
-	if remoteIPv6 == nil {
-		if _, err := dut.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
-			t.Fatalf("unable to ip addr add on container %s: %s", dut.Name, err)
-		}
-		// Now try again, to make sure that it worked.
-		_, dutDeviceInfo, err = deviceByIP(ctx, dut, addressInSubnet(dutAddr, *testNet.Subnet))
-		if err != nil {
-			t.Fatal(err)
-		}
-		remoteIPv6 = dutDeviceInfo.IPv6Addr
-		if remoteIPv6 == nil {
-			t.Fatal("unable to set IPv6 address on container", dut.Name)
-		}
-	}
-
-	// Create the Docker container for the testbench.
-	testbench := dockerutil.MakeNativeContainer(ctx, logger("testbench"))
-
-	tbb := path.Base(*testbenchBinary)
-	containerTestbenchBinary := "/packetimpact/" + tbb
-	runOpts = dockerutil.RunOpts{
-		Image:  "packetimpact",
-		CapAdd: []string{"NET_ADMIN"},
-		Mounts: []mount.Mount{mount.Mount{
-			Type:     mount.TypeBind,
-			Source:   tmpDir,
-			Target:   testOutputDir,
-			ReadOnly: false,
-		}},
-	}
-	testbench.CopyFiles(&runOpts, "/packetimpact", "/test/packetimpact/tests/"+tbb)
-
-	// Run tcpdump in the test bench unbuffered, without DNS resolution, just on
-	// the interface with the test packets.
-	snifferArgs := []string{
-		"tcpdump",
-		"-S", "-vvv", "-U", "-n",
-		"-i", testNetDev,
-		"-w", testOutputDir + "/dump.pcap",
-	}
-	snifferRegex := "tcpdump: listening.*\n"
-	if *tshark {
-		// Run tshark in the test bench unbuffered, without DNS resolution, just on
-		// the interface with the test packets.
-		snifferArgs = []string{
-			"tshark", "-V", "-l", "-n", "-i", testNetDev,
-			"-o", "tcp.check_checksum:TRUE",
-			"-o", "udp.check_checksum:TRUE",
-		}
-		snifferRegex = "Capturing on.*\n"
-	}
-
-	defer func() {
-		if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil {
-			t.Error("unable to copy container output files:", err)
-		}
-	}()
-
-	conf, hostconf, _ = testbench.ConfigsFrom(runOpts, snifferArgs...)
-	hostconf.AutoRemove = true
-	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
-
-	if err := testbench.CreateFrom(ctx, conf, hostconf, nil); err != nil {
-		t.Fatalf("unable to create container %s: %s", testbench.Name, err)
-	}
-	defer testbench.CleanUp(ctx)
-
-	// Add ctrlNet as eth1 and testNet as eth2.
-	if err := addNetworks(ctx, testbench, testbenchAddr, []*dockerutil.Network{ctrlNet, testNet}); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := testbench.Start(ctx); err != nil {
-		t.Fatalf("unable to start container %s: %s", testbench.Name, err)
-	}
-
-	// Kill so that it will flush output.
-	defer func() {
-		time.Sleep(1 * time.Second)
-		testbench.Exec(ctx, dockerutil.ExecOpts{}, "killall", snifferArgs[0])
-	}()
-
-	if _, err := testbench.WaitForOutput(ctx, snifferRegex, 60*time.Second); err != nil {
-		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
-	}
-
-	// Because the Linux kernel receives the SYN-ACK but didn't send the SYN it
-	// will issue an RST. To prevent this IPtables can be used to filter out all
-	// incoming packets. The raw socket that packetimpact tests use will still see
-	// everything.
-	for _, bin := range []string{"iptables", "ip6tables"} {
-		if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, bin, "-A", "INPUT", "-i", testNetDev, "-p", "tcp", "-j", "DROP"); err != nil {
-			t.Fatalf("unable to Exec %s on container %s: %s, logs from testbench:\n%s", bin, testbench.Name, err, logs)
-		}
-	}
-
-	// FIXME(b/156449515): Some piece of the system has a race. The old
-	// bash script version had a sleep, so we have one too. The race should
-	// be fixed and this sleep removed.
-	time.Sleep(time.Second)
-
-	// Start a packetimpact test on the test bench. The packetimpact test sends
-	// and receives packets and also sends POSIX socket commands to the
-	// posix_server to be executed on the DUT.
-	testArgs := []string{containerTestbenchBinary}
-	testArgs = append(testArgs, extraTestArgs...)
-	testArgs = append(testArgs,
-		"--posix_server_ip", addressInSubnet(dutAddr, *ctrlNet.Subnet).String(),
-		"--posix_server_port", ctrlPort,
-		"--remote_ipv4", addressInSubnet(dutAddr, *testNet.Subnet).String(),
-		"--local_ipv4", addressInSubnet(testbenchAddr, *testNet.Subnet).String(),
-		"--remote_ipv6", remoteIPv6.String(),
-		"--remote_mac", remoteMAC.String(),
-		"--remote_interface_id", fmt.Sprintf("%d", dutDeviceInfo.ID),
-		"--device", testNetDev,
-		fmt.Sprintf("--native=%t", *native),
-	)
-	testbenchLogs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, testArgs...)
-	if (err != nil) != *expectFailure {
-		var dutLogs string
-		if logs, err := dut.Logs(ctx); err != nil {
-			dutLogs = fmt.Sprintf("failed to fetch DUT logs: %s", err)
-		} else {
-			dutLogs = logs
-		}
-
-		t.Errorf(`test error: %v, expect failure: %t
-
-====== Begin of DUT Logs ======
-
-%s
-
-====== End of DUT Logs ======
-
-====== Begin of Testbench Logs ======
-
-%s
-
-====== End of Testbench Logs ======`,
-			err, *expectFailure, dutLogs, testbenchLogs)
-	}
-}
-
-func addNetworks(ctx context.Context, d *dockerutil.Container, addr net.IP, networks []*dockerutil.Network) error {
-	for _, dn := range networks {
-		ip := addressInSubnet(addr, *dn.Subnet)
-		// Connect to the network with the specified IP address.
-		if err := dn.Connect(ctx, d, ip.String(), ""); err != nil {
-			return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
-		}
-	}
-	return nil
-}
-
-// addressInSubnet combines the subnet provided with the address and returns a
-// new address. The return address bits come from the subnet where the mask is 1
-// and from the ip address where the mask is 0.
-func addressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
-	var octets []byte
-	for i := 0; i < 4; i++ {
-		octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i])))
-	}
-	return net.IP(octets)
-}
-
-// createDockerNetwork makes a randomly-named network that will start with the
-// namePrefix. The network will be a random /24 subnet.
-func createDockerNetwork(ctx context.Context, n *dockerutil.Network) error {
-	randSource := rand.NewSource(time.Now().UnixNano())
-	r1 := rand.New(randSource)
-	// Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
-	ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0)
-	n.Subnet = &net.IPNet{
-		IP:   ip,
-		Mask: ip.DefaultMask(),
-	}
-	return n.Create(ctx)
-}
-
-// deviceByIP finds a deviceInfo and device name from an IP address.
-func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string, netdevs.DeviceInfo, error) {
-	out, err := d.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "show")
-	if err != nil {
-		return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w\n%s", d.Name, err, out)
-	}
-	devs, err := netdevs.ParseDevices(out)
-	if err != nil {
-		return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w\n%s", d.Name, err, out)
-	}
-	testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs)
-	if err != nil {
-		return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err)
-	}
-	return testDevice, deviceInfo, nil
+	runner.TestWithDUT(context.Background(), t, runner.NewDockerDUT, runner.DutAddr)
 }
-- 
cgit v1.2.3


From f6d2444ed32ffef47ccc72a595d97721f3fa0561 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 3 Sep 2020 15:21:10 -0700
Subject: Use atomic.Value for Stack.tcpProbeFunc.

b/166980357#comment56 shows:

- 837 goroutines blocked in:
gvisor/pkg/sync/sync.(*RWMutex).Lock
gvisor/pkg/tcpip/stack/stack.(*Stack).StartTransportEndpointCleanup
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).cleanupLocked
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).completeWorkerLocked
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).protocolMainLoop.func1
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).protocolMainLoop

- 695 goroutines blocked in:
gvisor/pkg/sync/sync.(*RWMutex).Lock
gvisor/pkg/tcpip/stack/stack.(*Stack).CompleteTransportEndpointCleanup
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).cleanupLocked
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).completeWorkerLocked
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).protocolMainLoop.func1
gvisor/pkg/tcpip/transport/tcp/tcp.(*endpoint).protocolMainLoop

- 3882 goroutines blocked in:
gvisor/pkg/sync/sync.(*RWMutex).Lock
gvisor/pkg/tcpip/stack/stack.(*Stack).GetTCPProbe
gvisor/pkg/tcpip/transport/tcp/tcp.newEndpoint
gvisor/pkg/tcpip/transport/tcp/tcp.(*protocol).NewEndpoint
gvisor/pkg/tcpip/stack/stack.(*Stack).NewEndpoint

All of these are contending on Stack.mu. Stack.StartTransportEndpointCleanup()
and Stack.CompleteTransportEndpointCleanup() insert/delete TransportEndpoints
in a map (Stack.cleanupEndpoints), and the former also does endpoint
unregistration while holding Stack.mu, so it's not immediately clear how
feasible it is to replace the map with a mutex-less implementation or how much
doing so would help. However, Stack.GetTCPProbe() just reads a function object
(Stack.tcpProbeFunc) that is almost always nil (as far as I can tell,
Stack.AddTCPProbe() is only called in tests), and it's called for every new TCP
endpoint. So converting it to an atomic.Value should significantly reduce
contention on Stack.mu, improving TCP endpoint creation latency and allowing
TCP endpoint cleanup to proceed.

PiperOrigin-RevId: 330004140
---
 pkg/tcpip/stack/stack.go | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index c86ee1c13..66ce10357 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -429,7 +429,7 @@ type Stack struct {
 
 	// If not nil, then any new endpoints will have this probe function
 	// invoked everytime they receive a TCP segment.
-	tcpProbeFunc TCPProbeFunc
+	tcpProbeFunc atomic.Value // TCPProbeFunc
 
 	// clock is used to generate user-visible times.
 	clock tcpip.Clock
@@ -1795,18 +1795,17 @@ func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) Tra
 // guarantee provided on which probe will be invoked. Ideally this should only
 // be called once per stack.
 func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
-	s.mu.Lock()
-	s.tcpProbeFunc = probe
-	s.mu.Unlock()
+	s.tcpProbeFunc.Store(probe)
 }
 
 // GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil
 // otherwise.
 func (s *Stack) GetTCPProbe() TCPProbeFunc {
-	s.mu.Lock()
-	p := s.tcpProbeFunc
-	s.mu.Unlock()
-	return p
+	p := s.tcpProbeFunc.Load()
+	if p == nil {
+		return nil
+	}
+	return p.(TCPProbeFunc)
 }
 
 // RemoveTCPProbe removes an installed TCP probe.
@@ -1815,9 +1814,8 @@ func (s *Stack) GetTCPProbe() TCPProbeFunc {
 // have a probe attached. Endpoints already created will continue to invoke
 // TCP probe.
 func (s *Stack) RemoveTCPProbe() {
-	s.mu.Lock()
-	s.tcpProbeFunc = nil
-	s.mu.Unlock()
+	// This must be TCPProbeFunc(nil) because atomic.Value.Store(nil) panics.
+	s.tcpProbeFunc.Store(TCPProbeFunc(nil))
 }
 
 // JoinGroup joins the given multicast group on the given NIC.
-- 
cgit v1.2.3


From 4d5627f76346e8afbab1506b8151c7ccb3f82f16 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 3 Sep 2020 17:34:56 -0700
Subject: Use fine-grained mutex for stack.cleanupEndpoints.

stack.cleanupEndpoints is protected by the stack.mu but that can cause
contention as the stack mutex is already acquired in a lot of hot paths during
new endpoint creation /cleanup etc. Moving this to a fine grained mutex should
reduce contention on the stack.mu.

PiperOrigin-RevId: 330026151
---
 pkg/tcpip/stack/stack.go | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 66ce10357..133d90815 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -415,10 +415,13 @@ type Stack struct {
 
 	linkAddrCache *linkAddrCache
 
-	mu               sync.RWMutex
-	nics             map[tcpip.NICID]*NIC
-	forwarding       bool
-	cleanupEndpoints map[TransportEndpoint]struct{}
+	mu         sync.RWMutex
+	nics       map[tcpip.NICID]*NIC
+	forwarding bool
+
+	// cleanupEndpointsMu protects cleanupEndpoints.
+	cleanupEndpointsMu sync.Mutex
+	cleanupEndpoints   map[TransportEndpoint]struct{}
 
 	// route is the route table passed in by the user via SetRouteTable(),
 	// it is used by FindRoute() to build a route for a specific
@@ -1528,10 +1531,9 @@ func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip
 // StartTransportEndpointCleanup removes the endpoint with the given id from
 // the stack transport dispatcher. It also transitions it to the cleanup stage.
 func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
+	s.cleanupEndpointsMu.Lock()
 	s.cleanupEndpoints[ep] = struct{}{}
+	s.cleanupEndpointsMu.Unlock()
 
 	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
 }
@@ -1539,9 +1541,9 @@ func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcp
 // CompleteTransportEndpointCleanup removes the endpoint from the cleanup
 // stage.
 func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) {
-	s.mu.Lock()
+	s.cleanupEndpointsMu.Lock()
 	delete(s.cleanupEndpoints, ep)
-	s.mu.Unlock()
+	s.cleanupEndpointsMu.Unlock()
 }
 
 // FindTransportEndpoint finds an endpoint that most closely matches the provided
@@ -1584,23 +1586,23 @@ func (s *Stack) RegisteredEndpoints() []TransportEndpoint {
 
 // CleanupEndpoints returns endpoints currently in the cleanup state.
 func (s *Stack) CleanupEndpoints() []TransportEndpoint {
-	s.mu.Lock()
+	s.cleanupEndpointsMu.Lock()
 	es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints))
 	for e := range s.cleanupEndpoints {
 		es = append(es, e)
 	}
-	s.mu.Unlock()
+	s.cleanupEndpointsMu.Unlock()
 	return es
 }
 
 // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
 // for restoring a stack after a save.
 func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
-	s.mu.Lock()
+	s.cleanupEndpointsMu.Lock()
 	for _, e := range es {
 		s.cleanupEndpoints[e] = struct{}{}
 	}
-	s.mu.Unlock()
+	s.cleanupEndpointsMu.Unlock()
 }
 
 // Close closes all currently registered transport endpoints.
-- 
cgit v1.2.3


From fb6c6faea2cefb05440845fccce9dcab0779b90d Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 3 Sep 2020 23:29:13 -0700
Subject: Adjust input file offset when sendfile only completes a partial
 write.

Fixes #3779.

PiperOrigin-RevId: 330057268
---
 pkg/sentry/syscalls/linux/vfs2/BUILD     |  1 +
 pkg/sentry/syscalls/linux/vfs2/splice.go | 22 ++++++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 64696b438..0030dee39 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -44,6 +44,7 @@ go_library(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/gohacks",
+        "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 68ce94778..5543cfac2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -18,6 +18,7 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
@@ -390,16 +391,21 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 					err = dw.waitForOut(t)
 				}
 				if err != nil {
-					// We didn't complete the write. Only
-					// report the bytes that were actually
-					// written, and rewind the offset.
+					// We didn't complete the write. Only report the bytes that were actually
+					// written, and rewind offsets as needed.
 					notWritten := int64(len(wbuf))
 					n -= notWritten
-					if offset != -1 {
-						// TODO(gvisor.dev/issue/3779): The inFile offset will be incorrect if we
-						// roll back, because it has already been advanced by the full amount.
-						// Merely seeking on inFile does not work, because there may be concurrent
-						// file operations.
+					if offset == -1 {
+						// We modified the offset of the input file itself during the read
+						// operation. Rewind it.
+						if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil {
+							// Log the error but don't return it, since the write has already
+							// completed successfully.
+							log.Warningf("failed to roll back input file offset: %v", seekErr)
+						}
+					} else {
+						// The sendfile call was provided an offset parameter that should be
+						// adjusted to reflect the number of bytes sent. Rewind it.
 						offset -= notWritten
 					}
 					break
-- 
cgit v1.2.3


From 3daaddb90c3d72c6244ef379c1a9a651aa971bef Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 4 Sep 2020 11:36:41 -0700
Subject: Simplify FD handling for container start/exec

VFS1 and VFS2 host FDs have different dupping behavior,
making error prone to code for both. Change the contract
so that FDs are released as they are used, so the caller
can simple defer a block that closes all remaining files.
This also addresses handling of partial failures.

With this fix, more VFS2 tests can be enabled.

Updates #1487

PiperOrigin-RevId: 330112266
---
 pkg/fd/fd.go                            | 42 ++++++++++----
 pkg/sentry/control/BUILD                |  1 -
 pkg/sentry/control/proc.go              | 28 +++-------
 pkg/sentry/fdimport/BUILD               |  1 +
 pkg/sentry/fdimport/fdimport.go         | 22 +++++---
 runsc/boot/BUILD                        |  2 +
 runsc/boot/controller.go                | 12 +++-
 runsc/boot/fs.go                        |  7 ++-
 runsc/boot/loader.go                    | 99 ++++++++++++---------------------
 runsc/boot/loader_test.go               |  9 ++-
 runsc/container/multi_container_test.go |  9 ++-
 11 files changed, 120 insertions(+), 112 deletions(-)

(limited to 'pkg')

diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go
index 83bcfe220..cc6b0cdf1 100644
--- a/pkg/fd/fd.go
+++ b/pkg/fd/fd.go
@@ -49,7 +49,7 @@ func fixCount(n int, err error) (int, error) {
 
 // Read implements io.Reader.
 func (r *ReadWriter) Read(b []byte) (int, error) {
-	c, err := fixCount(syscall.Read(int(atomic.LoadInt64(&r.fd)), b))
+	c, err := fixCount(syscall.Read(r.FD(), b))
 	if c == 0 && len(b) > 0 && err == nil {
 		return 0, io.EOF
 	}
@@ -62,7 +62,7 @@ func (r *ReadWriter) Read(b []byte) (int, error) {
 func (r *ReadWriter) ReadAt(b []byte, off int64) (c int, err error) {
 	for len(b) > 0 {
 		var m int
-		m, err = fixCount(syscall.Pread(int(atomic.LoadInt64(&r.fd)), b, off))
+		m, err = fixCount(syscall.Pread(r.FD(), b, off))
 		if m == 0 && err == nil {
 			return c, io.EOF
 		}
@@ -82,7 +82,7 @@ func (r *ReadWriter) Write(b []byte) (int, error) {
 	var n, remaining int
 	for remaining = len(b); remaining > 0; {
 		woff := len(b) - remaining
-		n, err = syscall.Write(int(atomic.LoadInt64(&r.fd)), b[woff:])
+		n, err = syscall.Write(r.FD(), b[woff:])
 
 		if n > 0 {
 			// syscall.Write wrote some bytes. This is the common case.
@@ -110,7 +110,7 @@ func (r *ReadWriter) Write(b []byte) (int, error) {
 func (r *ReadWriter) WriteAt(b []byte, off int64) (c int, err error) {
 	for len(b) > 0 {
 		var m int
-		m, err = fixCount(syscall.Pwrite(int(atomic.LoadInt64(&r.fd)), b, off))
+		m, err = fixCount(syscall.Pwrite(r.FD(), b, off))
 		if err != nil {
 			break
 		}
@@ -121,6 +121,16 @@ func (r *ReadWriter) WriteAt(b []byte, off int64) (c int, err error) {
 	return
 }
 
+// FD returns the owned file descriptor. Ownership remains unchanged.
+func (r *ReadWriter) FD() int {
+	return int(atomic.LoadInt64(&r.fd))
+}
+
+// String implements Stringer.String().
+func (r *ReadWriter) String() string {
+	return fmt.Sprintf("FD: %d", r.FD())
+}
+
 // FD owns a host file descriptor.
 //
 // It is similar to os.File, with a few important distinctions:
@@ -167,6 +177,23 @@ func NewFromFile(file *os.File) (*FD, error) {
 	return New(fd), nil
 }
 
+// NewFromFiles creates new FDs for each file in the slice.
+func NewFromFiles(files []*os.File) ([]*FD, error) {
+	rv := make([]*FD, 0, len(files))
+	for _, f := range files {
+		new, err := NewFromFile(f)
+		if err != nil {
+			// Cleanup on error.
+			for _, fd := range rv {
+				fd.Close()
+			}
+			return nil, err
+		}
+		rv = append(rv, new)
+	}
+	return rv, nil
+}
+
 // Open is equivalent to open(2).
 func Open(path string, openmode int, perm uint32) (*FD, error) {
 	f, err := syscall.Open(path, openmode|syscall.O_LARGEFILE, perm)
@@ -204,11 +231,6 @@ func (f *FD) Release() int {
 	return int(atomic.SwapInt64(&f.fd, -1))
 }
 
-// FD returns the file descriptor owned by FD. FD retains ownership.
-func (f *FD) FD() int {
-	return int(atomic.LoadInt64(&f.fd))
-}
-
 // File converts the FD to an os.File.
 //
 // FD does not transfer ownership of the file descriptor (it will be
@@ -219,7 +241,7 @@ func (f *FD) FD() int {
 // This operation is somewhat expensive, so care should be taken to minimize
 // its use.
 func (f *FD) File() (*os.File, error) {
-	fd, err := syscall.Dup(int(atomic.LoadInt64(&f.fd)))
+	fd, err := syscall.Dup(f.FD())
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 2c5d14be5..deaf5fa23 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -35,7 +35,6 @@ go_library(
         "//pkg/sync",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
-        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index dfa936563..668f47802 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -23,8 +23,8 @@ import (
 	"text/tabwriter"
 	"time"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
@@ -203,27 +203,17 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 	}
 	initArgs.Filename = resolved
 
-	fds := make([]int, len(args.FilePayload.Files))
-	for i, file := range args.FilePayload.Files {
-		if kernel.VFS2Enabled {
-			// Need to dup to remove ownership from os.File.
-			dup, err := unix.Dup(int(file.Fd()))
-			if err != nil {
-				return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
-			}
-			fds[i] = dup
-		} else {
-			// VFS1 dups the file on import.
-			fds[i] = int(file.Fd())
-		}
+	fds, err := fd.NewFromFiles(args.Files)
+	if err != nil {
+		return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
 	}
+	defer func() {
+		for _, fd := range fds {
+			_ = fd.Close()
+		}
+	}()
 	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds)
 	if err != nil {
-		if kernel.VFS2Enabled {
-			for _, fd := range fds {
-				unix.Close(fd)
-			}
-		}
 		return nil, 0, nil, nil, err
 	}
 
diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD
index 5e41ceb4e..6b4f8b0ed 100644
--- a/pkg/sentry/fdimport/BUILD
+++ b/pkg/sentry/fdimport/BUILD
@@ -10,6 +10,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/context",
+        "//pkg/fd",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/host",
         "//pkg/sentry/fsimpl/host",
diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go
index 1b7cb94c0..314661475 100644
--- a/pkg/sentry/fdimport/fdimport.go
+++ b/pkg/sentry/fdimport/fdimport.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
@@ -27,8 +28,9 @@ import (
 
 // Import imports a slice of FDs into the given FDTable. If console is true,
 // sets up TTY for the first 3 FDs in the slice representing stdin, stdout,
-// stderr. Upon success, Import takes ownership of all FDs.
-func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+// stderr. Used FDs are either closed or released. It's safe for the caller to
+// close any remaining files upon return.
+func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	if kernel.VFS2Enabled {
 		ttyFile, err := importVFS2(ctx, fdTable, console, fds)
 		return nil, ttyFile, err
@@ -37,7 +39,7 @@ func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []in
 	return ttyFile, nil, err
 }
 
-func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) {
+func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, error) {
 	var ttyFile *fs.File
 	for appFD, hostFD := range fds {
 		var appFile *fs.File
@@ -46,11 +48,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
 			// Import the file as a host TTY file.
 			if ttyFile == nil {
 				var err error
-				appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */)
+				appFile, err = host.ImportFile(ctx, hostFD.FD(), true /* isTTY */)
 				if err != nil {
 					return nil, err
 				}
 				defer appFile.DecRef(ctx)
+				_ = hostFD.Close() // FD is dup'd i ImportFile.
 
 				// Remember this in the TTY file, as we will
 				// use it for the other stdio FDs.
@@ -65,11 +68,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
 		} else {
 			// Import the file as a regular host file.
 			var err error
-			appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */)
+			appFile, err = host.ImportFile(ctx, hostFD.FD(), false /* isTTY */)
 			if err != nil {
 				return nil, err
 			}
 			defer appFile.DecRef(ctx)
+			_ = hostFD.Close() // FD is dup'd i ImportFile.
 		}
 
 		// Add the file to the FD map.
@@ -84,7 +88,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
 	return ttyFile.FileOperations.(*host.TTYFileOperations), nil
 }
 
-func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) {
+func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) {
 	k := kernel.KernelFromContext(ctx)
 	if k == nil {
 		return nil, fmt.Errorf("cannot find kernel from context")
@@ -98,11 +102,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
 			// Import the file as a host TTY file.
 			if ttyFile == nil {
 				var err error
-				appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */)
+				appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */)
 				if err != nil {
 					return nil, err
 				}
 				defer appFile.DecRef(ctx)
+				hostFD.Release() // FD is transfered to host FD.
 
 				// Remember this in the TTY file, as we will use it for the other stdio
 				// FDs.
@@ -115,11 +120,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
 			}
 		} else {
 			var err error
-			appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */)
+			appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */)
 			if err != nil {
 				return nil, err
 			}
 			defer appFile.DecRef(ctx)
+			hostFD.Release() // FD is transfered to host FD.
 		}
 
 		if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 040f6a72d..704c66742 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -30,6 +30,7 @@ go_library(
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
@@ -123,6 +124,7 @@ go_test(
     library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 68a2b45cf..894651519 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -257,13 +258,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 	// All validation passed, logs the spec for debugging.
 	specutils.LogSpec(args.Spec)
 
-	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	fds, err := fd.NewFromFiles(args.FilePayload.Files)
 	if err != nil {
+		return err
+	}
+	defer func() {
+		for _, fd := range fds {
+			_ = fd.Close()
+		}
+	}()
+	if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
 		return err
 	}
 	log.Debugf("Container %q started", args.CID)
-
 	return nil
 }
 
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index ea0461a3d..e2c5f5fb1 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -34,6 +34,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -320,14 +321,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
 }
 
 type fdDispenser struct {
-	fds []int
+	fds []*fd.FD
 }
 
 func (f *fdDispenser) remove() int {
 	if f.empty() {
 		panic("fdDispenser out of fds")
 	}
-	rv := f.fds[0]
+	rv := f.fds[0].Release()
 	f.fds = f.fds[1:]
 	return rv
 }
@@ -564,7 +565,7 @@ type containerMounter struct {
 	hints *podMountHints
 }
 
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
 	return &containerMounter{
 		root:   spec.Root,
 		mounts: compileMounts(spec),
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 882cf270b..246ae3c3e 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -29,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
@@ -89,10 +90,10 @@ type containerInfo struct {
 	procArgs kernel.CreateProcessArgs
 
 	// stdioFDs contains stdin, stdout, and stderr.
-	stdioFDs []int
+	stdioFDs []*fd.FD
 
 	// goferFDs are the FDs that attach the sandbox to the gofers.
-	goferFDs []int
+	goferFDs []*fd.FD
 }
 
 // Loader keeps state needed to start the kernel and run the container..
@@ -356,12 +357,17 @@ func New(args Args) (*Loader, error) {
 		k.SetHostMount(hostMount)
 	}
 
+	info := containerInfo{
+		conf:     args.Conf,
+		spec:     args.Spec,
+		procArgs: procArgs,
+	}
+
 	// Make host FDs stable between invocations. Host FDs must map to the exact
 	// same number when the sandbox is restored. Otherwise the wrong FD will be
 	// used.
-	var stdioFDs []int
 	newfd := startingStdioFD
-	for _, fd := range args.StdioFDs {
+	for _, stdioFD := range args.StdioFDs {
 		// Check that newfd is unused to avoid clobbering over it.
 		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
 			if err != nil {
@@ -370,14 +376,17 @@ func New(args Args) (*Loader, error) {
 			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
 		}
 
-		err := unix.Dup3(fd, newfd, unix.O_CLOEXEC)
+		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
 		if err != nil {
-			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
 		}
-		stdioFDs = append(stdioFDs, newfd)
-		_ = unix.Close(fd)
+		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+		_ = unix.Close(stdioFD)
 		newfd++
 	}
+	for _, goferFD := range args.GoferFDs {
+		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+	}
 
 	eid := execID{cid: args.ID}
 	l := &Loader{
@@ -386,13 +395,7 @@ func New(args Args) (*Loader, error) {
 		sandboxID:  args.ID,
 		processes:  map[execID]*execProcess{eid: {}},
 		mountHints: mountHints,
-		root: containerInfo{
-			conf:     args.Conf,
-			stdioFDs: stdioFDs,
-			goferFDs: args.GoferFDs,
-			spec:     args.Spec,
-			procArgs: procArgs,
-		},
+		root:       info,
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -466,9 +469,14 @@ func (l *Loader) Destroy() {
 	}
 	l.watchdog.Stop()
 
-	for i, fd := range l.root.stdioFDs {
-		_ = unix.Close(fd)
-		l.root.stdioFDs[i] = -1
+	// In the success case, stdioFDs and goferFDs will only contain
+	// released/closed FDs that ownership has been passed over to host FDs and
+	// gofer sessions. Close them here in case on failure.
+	for _, fd := range l.root.stdioFDs {
+		_ = fd.Close()
+	}
+	for _, fd := range l.root.goferFDs {
+		_ = fd.Close()
 	}
 }
 
@@ -598,17 +606,6 @@ func (l *Loader) run() error {
 		}
 	})
 
-	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
-	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
-	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
-	// passed FDs, so only close for VFS1.
-	if !kernel.VFS2Enabled {
-		for i, fd := range l.root.stdioFDs {
-			_ = unix.Close(fd)
-			l.root.stdioFDs[i] = -1
-		}
-	}
-
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
@@ -628,9 +625,9 @@ func (l *Loader) createContainer(cid string) error {
 }
 
 // startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -681,37 +678,15 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
 	}
 
 	info := &containerInfo{
-		conf: conf,
-		spec: spec,
+		conf:     conf,
+		spec:     spec,
+		stdioFDs: files[:3],
+		goferFDs: files[3:],
 	}
 	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
 	if err != nil {
 		return fmt.Errorf("creating new process: %v", err)
 	}
-
-	// VFS1 dups stdioFDs, so we don't need to dup them here. VFS2 takes
-	// ownership of the passed FDs, and we need to dup them here.
-	for _, f := range files[:3] {
-		if !kernel.VFS2Enabled {
-			info.stdioFDs = append(info.stdioFDs, int(f.Fd()))
-		} else {
-			fd, err := unix.Dup(int(f.Fd()))
-			if err != nil {
-				return fmt.Errorf("failed to dup file: %v", err)
-			}
-			info.stdioFDs = append(info.stdioFDs, fd)
-		}
-	}
-
-	// Can't take ownership away from os.File. dup them to get a new FDs.
-	for _, f := range files[3:] {
-		fd, err := unix.Dup(int(f.Fd()))
-		if err != nil {
-			return fmt.Errorf("failed to dup file: %v", err)
-		}
-		info.goferFDs = append(info.goferFDs, fd)
-	}
-
 	tg, err := l.createContainerProcess(false, cid, info, ep)
 	if err != nil {
 		return err
@@ -795,13 +770,13 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
 // the gofer FDs looking for disconnects, and destroys the container if a
 // disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	go func() {
 		log.Debugf("Monitoring gofer health for container %q", cid)
 		var events []unix.PollFd
-		for _, fd := range goferFDs {
+		for _, goferFD := range goferFDs {
 			events = append(events, unix.PollFd{
-				Fd:     int32(fd),
+				Fd:     int32(goferFD.FD()),
 				Events: unix.POLLHUP | unix.POLLRDHUP,
 			})
 		}
@@ -1281,7 +1256,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
 	return ep.tty, ep.ttyVFS2, nil
 }
 
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	if len(stdioFDs) != 3 {
 		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 2343ce76c..dc9861389 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
@@ -444,7 +445,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
 			mns, err := mntr.createMountNamespace(ctx, conf)
 			if err != nil {
 				t.Fatalf("failed to create mount namespace: %v", err)
@@ -702,7 +703,11 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+			var ioFDs []*fd.FD
+			for _, ioFD := range tc.ioFDs {
+				ioFDs = append(ioFDs, fd.New(ioFD))
+			}
+			mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
 			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index da1694280..5b790c6c8 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -169,7 +169,7 @@ func TestMultiContainerSanity(t *testing.T) {
 // TestMultiPIDNS checks that it is possible to run 2 dead-simple
 // containers in the same sandbox with different pidns.
 func TestMultiPIDNS(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -214,7 +214,7 @@ func TestMultiPIDNS(t *testing.T) {
 
 // TestMultiPIDNSPath checks the pidns path.
 func TestMultiPIDNSPath(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -580,7 +580,7 @@ func TestMultiContainerDestroy(t *testing.T) {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1252,8 +1252,7 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
-	//TODO(gvisor.dev/issue/1487): This is failing with VFS2.
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
-- 
cgit v1.2.3


From 531340c7ba4da9bc4773dd7db77b62522c45aa20 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 8 Sep 2020 11:49:51 -0700
Subject: [vfs] Capitalize x in the {Get/Set/Remove/List}xattr functions.

PiperOrigin-RevId: 330554450
---
 pkg/sentry/fs/inode.go                       |  2 +-
 pkg/sentry/fs/inode_overlay.go               |  2 +-
 pkg/sentry/fsimpl/ext/filesystem.go          | 16 ++++-----
 pkg/sentry/fsimpl/gofer/filesystem.go        | 24 ++++++-------
 pkg/sentry/fsimpl/gofer/gofer.go             | 32 +++++++++---------
 pkg/sentry/fsimpl/kernfs/filesystem.go       | 16 ++++-----
 pkg/sentry/fsimpl/overlay/copy_up.go         | 12 +++----
 pkg/sentry/fsimpl/overlay/filesystem.go      | 36 ++++++++++----------
 pkg/sentry/fsimpl/overlay/overlay.go         | 16 ++++-----
 pkg/sentry/fsimpl/tmpfs/filesystem.go        | 24 ++++++-------
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             | 40 +++++++++++-----------
 pkg/sentry/fsimpl/verity/filesystem.go       | 26 +++++++--------
 pkg/sentry/syscalls/linux/vfs2/vfs2.go       | 16 ++++-----
 pkg/sentry/syscalls/linux/vfs2/xattr.go      | 32 +++++++++---------
 pkg/sentry/vfs/anonfs.go                     | 16 ++++-----
 pkg/sentry/vfs/file_description.go           | 50 ++++++++++++++--------------
 pkg/sentry/vfs/file_description_impl_util.go | 18 +++++-----
 pkg/sentry/vfs/filesystem.go                 | 24 ++++++-------
 pkg/sentry/vfs/memxattr/xattr.go             | 16 ++++-----
 pkg/sentry/vfs/options.go                    | 16 ++++-----
 pkg/sentry/vfs/vfs.go                        | 24 ++++++-------
 test/syscalls/linux/xattr.cc                 | 50 ++++++++++++++--------------
 22 files changed, 254 insertions(+), 254 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b79cd9877..004910453 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,7 +270,7 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string,
 // SetXattr calls i.InodeOperations.SetXattr with i as the Inode.
 func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error {
 	if i.overlay != nil {
-		return overlaySetxattr(ctx, i.overlay, d, name, value, flags)
+		return overlaySetXattr(ctx, i.overlay, d, name, value, flags)
 	}
 	return i.InodeOperations.SetXattr(ctx, i, name, value, flags)
 }
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 0a2d64e3a..b16ab08ba 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -552,7 +552,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
 	return s, err
 }
 
-func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
+func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
 	// Don't allow changes to overlay xattrs through a setxattr syscall.
 	if isXattrOverlay(name) {
 		return syserror.EPERM
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 8565d1a66..075de0e22 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -504,8 +504,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return nil, err
@@ -513,8 +513,8 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	return nil, syserror.ENOTSUP
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return "", err
@@ -522,8 +522,8 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
@@ -531,8 +531,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return syserror.ENOTSUP
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 5d0f487db..b01121f9e 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1519,8 +1519,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1528,11 +1528,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	if err != nil {
 		return nil, err
 	}
-	return d.listxattr(ctx, rp.Credentials(), size)
+	return d.listXattr(ctx, rp.Credentials(), size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1540,11 +1540,11 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	if err != nil {
 		return "", err
 	}
-	return d.getxattr(ctx, rp.Credentials(), &opts)
+	return d.getXattr(ctx, rp.Credentials(), &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1552,7 +1552,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
+	if err := d.setXattr(ctx, rp.Credentials(), &opts); err != nil {
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
@@ -1562,8 +1562,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return nil
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1571,7 +1571,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
+	if err := d.removeXattr(ctx, rp.Credentials(), name); err != nil {
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 78b07f1b3..fa4e19113 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1372,7 +1372,7 @@ func (d *dentry) setDeleted() {
 	atomic.StoreUint32(&d.deleted, 1)
 }
 
-func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
 	if d.file.isNil() || !d.userXattrSupported() {
 		return nil, nil
 	}
@@ -1390,7 +1390,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
 	return xattrs, nil
 }
 
-func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
 	if d.file.isNil() {
 		return "", syserror.ENODATA
 	}
@@ -1400,7 +1400,7 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
-func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
@@ -1410,7 +1410,7 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
-func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
@@ -1668,30 +1668,30 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return nil
 }
 
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
 }
 
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
-	return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
 	d := fd.dentry()
-	if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+	if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
 		return err
 	}
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
 }
 
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
 	d := fd.dentry()
-	if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+	if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
 		return err
 	}
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 0e3011689..c428053e8 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -810,8 +810,8 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
@@ -823,8 +823,8 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	return nil, syserror.ENOTSUP
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
@@ -836,8 +836,8 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
@@ -849,8 +849,8 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return syserror.ENOTSUP
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index ba7b8495a..c589b4746 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -278,13 +278,13 @@ func (d *dentry) copyXattrsLocked(ctx context.Context) error {
 	lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]}
 	upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
 
-	lowerXattrs, err := vfsObj.ListxattrAt(ctx, d.fs.creds, lowerPop, 0)
+	lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0)
 	if err != nil {
 		if err == syserror.EOPNOTSUPP {
 			// There are no guarantees as to the contents of lowerXattrs.
 			return nil
 		}
-		ctx.Warningf("failed to copy up xattrs because ListxattrAt failed: %v", err)
+		ctx.Warningf("failed to copy up xattrs because ListXattrAt failed: %v", err)
 		return err
 	}
 
@@ -294,14 +294,14 @@ func (d *dentry) copyXattrsLocked(ctx context.Context) error {
 			continue
 		}
 
-		value, err := vfsObj.GetxattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetxattrOptions{Name: name, Size: 0})
+		value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0})
 		if err != nil {
-			ctx.Warningf("failed to copy up xattrs because GetxattrAt failed: %v", err)
+			ctx.Warningf("failed to copy up xattrs because GetXattrAt failed: %v", err)
 			return err
 		}
 
-		if err := vfsObj.SetxattrAt(ctx, d.fs.creds, upperPop, &vfs.SetxattrOptions{Name: name, Value: value}); err != nil {
-			ctx.Warningf("failed to copy up xattrs because SetxattrAt failed: %v", err)
+		if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil {
+			ctx.Warningf("failed to copy up xattrs because SetXattrAt failed: %v", err)
 			return err
 		}
 	}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 46528c99c..87afeeaf3 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -273,10 +273,10 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 
 		// Directories are merged with directories from lower layers if they
 		// are not explicitly opaque.
-		opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+		opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
 			Root:  childVD,
 			Start: childVD,
-		}, &vfs.GetxattrOptions{
+		}, &vfs.GetXattrOptions{
 			Name: _OVL_XATTR_OPAQUE,
 			Size: 1,
 		})
@@ -671,7 +671,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			// There may be directories on lower layers (previously hidden by
 			// the whiteout) that the new directory should not be merged with.
 			// Mark it opaque to prevent merging.
-			if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+			if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
 				Name:  _OVL_XATTR_OPAQUE,
 				Value: "y",
 			}); err != nil {
@@ -1359,8 +1359,8 @@ func isOverlayXattr(name string) bool {
 	return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -1375,7 +1375,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
 	top := d.topLayer()
-	names, err := vfsObj.ListxattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
+	names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
 	if err != nil {
 		return nil, err
 	}
@@ -1391,8 +1391,8 @@ func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]
 	return names[:n], err
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -1404,7 +1404,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return fs.getXattr(ctx, d, rp.Credentials(), &opts)
 }
 
-func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
 	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
@@ -1418,11 +1418,11 @@ func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Crede
 	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
 	top := d.topLayer()
-	return vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
+	return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -1435,7 +1435,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 }
 
 // Precondition: fs.renameMu must be locked.
-func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
 	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
@@ -1455,11 +1455,11 @@ func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mo
 		return err
 	}
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
-	return vfsObj.SetxattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
+	return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -1477,7 +1477,7 @@ func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs
 		return err
 	}
 
-	// Like SetxattrAt, return EOPNOTSUPP when removing an overlay attribute.
+	// Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
 	// Linux passes the remove request to xattr_handler->set.
 	// See fs/xattr.c:vfs_removexattr().
 	if isOverlayXattr(name) {
@@ -1492,7 +1492,7 @@ func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs
 		return err
 	}
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
-	return vfsObj.RemovexattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
+	return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index e706f9d4e..9a8f7010e 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -632,26 +632,26 @@ func (fd *fileDescription) dentry() *dentry {
 	return fd.vfsfd.Dentry().Impl().(*dentry)
 }
 
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
 	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
 }
 
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
 	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
 }
 
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
 	fs := fd.filesystem()
 	fs.renameMu.RLock()
 	defer fs.renameMu.RUnlock()
 	return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
 }
 
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
 	fs := fd.filesystem()
 	fs.renameMu.RLock()
 	defer fs.renameMu.RUnlock()
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e0de04e05..61d925a73 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -792,37 +792,37 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	}
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
-	return d.inode.listxattr(size)
+	return d.inode.listXattr(size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		return "", err
 	}
-	return d.inode.getxattr(rp.Credentials(), &opts)
+	return d.inode.getXattr(rp.Credentials(), &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	fs.mu.RLock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+	if err := d.inode.setXattr(rp.Credentials(), &opts); err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
@@ -832,15 +832,15 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return nil
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+	if err := d.inode.removeXattr(rp.Credentials(), name); err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index d6074f20f..4871e55d3 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -626,29 +626,29 @@ func (i *inode) touchCMtimeLocked() {
 	atomic.StoreInt64(&i.ctime, now)
 }
 
-func (i *inode) listxattr(size uint64) ([]string, error) {
-	return i.xattrs.Listxattr(size)
+func (i *inode) listXattr(size uint64) ([]string, error) {
+	return i.xattrs.ListXattr(size)
 }
 
-func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
 	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
-	return i.xattrs.Getxattr(opts)
+	return i.xattrs.GetXattr(opts)
 }
 
-func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
 	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
-	return i.xattrs.Setxattr(opts)
+	return i.xattrs.SetXattr(opts)
 }
 
-func (i *inode) removexattr(creds *auth.Credentials, name string) error {
+func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
 	if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
 		return err
 	}
-	return i.xattrs.Removexattr(name)
+	return i.xattrs.RemoveXattr(name)
 }
 
 func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
@@ -712,20 +712,20 @@ func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
 	return globalStatfs, nil
 }
 
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	return fd.inode().listxattr(size)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.inode().listXattr(size)
 }
 
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
-	return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
 }
 
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
 	d := fd.dentry()
-	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+	if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
 		return err
 	}
 
@@ -734,10 +734,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption
 	return nil
 }
 
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
 	d := fd.dentry()
-	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+	if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil {
 		return err
 	}
 
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 0e17dbddc..e944fd5d2 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -179,10 +179,10 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
 	// corresponding Merkle tree file.
 	// This is the offset of the root hash for child in its parent's Merkle
 	// tree file.
-	off, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+	off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
 		Root:  child.lowerMerkleVD,
 		Start: child.lowerMerkleVD,
-	}, &vfs.GetxattrOptions{
+	}, &vfs.GetXattrOptions{
 		Name: merkleOffsetInParentXattr,
 		// Offset is a 32 bit integer.
 		Size: sizeOfInt32,
@@ -233,7 +233,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
 	// dataSize is the size of raw data for the Merkle tree. For a file,
 	// dataSize is the size of the whole file. For a directory, dataSize is
 	// the size of all its children's root hashes.
-	dataSize, err := parentMerkleFD.Getxattr(ctx, &vfs.GetxattrOptions{
+	dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
 		Name: merkleSizeXattr,
 		Size: sizeOfInt32,
 	})
@@ -660,8 +660,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -670,14 +670,14 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 		return nil, err
 	}
 	lowerVD := d.lowerVD
-	return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+	return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  lowerVD,
 		Start: lowerVD,
 	}, size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -686,20 +686,20 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 		return "", err
 	}
 	lowerVD := d.lowerVD
-	return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+	return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  lowerVD,
 		Start: lowerVD,
 	}, &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	// Verity file system is read-only.
 	return syserror.EROFS
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	// Verity file system is read-only.
 	return syserror.EROFS
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index c576d9475..0df3bd449 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -93,16 +93,16 @@ func Override() {
 	s.Table[165] = syscalls.Supported("mount", Mount)
 	s.Table[166] = syscalls.Supported("umount2", Umount2)
 	s.Table[187] = syscalls.Supported("readahead", Readahead)
-	s.Table[188] = syscalls.Supported("setxattr", Setxattr)
+	s.Table[188] = syscalls.Supported("setxattr", SetXattr)
 	s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
 	s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
-	s.Table[191] = syscalls.Supported("getxattr", Getxattr)
+	s.Table[191] = syscalls.Supported("getxattr", GetXattr)
 	s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
 	s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
-	s.Table[194] = syscalls.Supported("listxattr", Listxattr)
+	s.Table[194] = syscalls.Supported("listxattr", ListXattr)
 	s.Table[195] = syscalls.Supported("llistxattr", Llistxattr)
 	s.Table[196] = syscalls.Supported("flistxattr", Flistxattr)
-	s.Table[197] = syscalls.Supported("removexattr", Removexattr)
+	s.Table[197] = syscalls.Supported("removexattr", RemoveXattr)
 	s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
 	s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
 	s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
@@ -163,16 +163,16 @@ func Override() {
 
 	// Override ARM64.
 	s = linux.ARM64
-	s.Table[5] = syscalls.Supported("setxattr", Setxattr)
+	s.Table[5] = syscalls.Supported("setxattr", SetXattr)
 	s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr)
 	s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr)
-	s.Table[8] = syscalls.Supported("getxattr", Getxattr)
+	s.Table[8] = syscalls.Supported("getxattr", GetXattr)
 	s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr)
 	s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr)
-	s.Table[11] = syscalls.Supported("listxattr", Listxattr)
+	s.Table[11] = syscalls.Supported("listxattr", ListXattr)
 	s.Table[12] = syscalls.Supported("llistxattr", Llistxattr)
 	s.Table[13] = syscalls.Supported("flistxattr", Flistxattr)
-	s.Table[14] = syscalls.Supported("removexattr", Removexattr)
+	s.Table[14] = syscalls.Supported("removexattr", RemoveXattr)
 	s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr)
 	s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr)
 	s.Table[17] = syscalls.Supported("getcwd", Getcwd)
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index ef99246ed..e05723ef9 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -26,8 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// Listxattr implements Linux syscall listxattr(2).
-func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// ListXattr implements Linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return listxattr(t, args, followFinalSymlink)
 }
 
@@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
 	}
 	defer tpop.Release(t)
 
-	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
+	names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 	defer file.DecRef(t)
 
-	names, err := file.Listxattr(t, uint64(size))
+	names, err := file.ListXattr(t, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -85,8 +85,8 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	return uintptr(n), nil, nil
 }
 
-// Getxattr implements Linux syscall getxattr(2).
-func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// GetXattr implements Linux syscall getxattr(2).
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return getxattr(t, args, followFinalSymlink)
 }
 
@@ -116,7 +116,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
 		return 0, nil, err
 	}
 
-	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+	value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{
 		Name: name,
 		Size: uint64(size),
 	})
@@ -148,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
+	value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)})
 	if err != nil {
 		return 0, nil, err
 	}
@@ -159,8 +159,8 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return uintptr(n), nil, nil
 }
 
-// Setxattr implements Linux syscall setxattr(2).
-func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// SetXattr implements Linux syscall setxattr(2).
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, setxattr(t, args, followFinalSymlink)
 }
 
@@ -199,7 +199,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
 		return err
 	}
 
-	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+	return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{
 		Name:  name,
 		Value: value,
 		Flags: uint32(flags),
@@ -233,15 +233,15 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
+	return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{
 		Name:  name,
 		Value: value,
 		Flags: uint32(flags),
 	})
 }
 
-// Removexattr implements Linux syscall removexattr(2).
-func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// RemoveXattr implements Linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, removexattr(t, args, followFinalSymlink)
 }
 
@@ -269,7 +269,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy
 		return err
 	}
 
-	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+	return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name)
 }
 
 // Fremovexattr implements Linux syscall fremovexattr(2).
@@ -288,7 +288,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 		return 0, nil, err
 	}
 
-	return 0, nil, file.Removexattr(t, name)
+	return 0, nil, file.RemoveXattr(t, name)
 }
 
 func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 5a0e3e6b5..9c4db3047 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -245,32 +245,32 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements FilesystemImpl.ListXattrAt.
+func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
 	if !rp.Done() {
 		return nil, syserror.ENOTDIR
 	}
 	return nil, nil
 }
 
-// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
+// GetXattrAt implements FilesystemImpl.GetXattrAt.
+func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) {
 	if !rp.Done() {
 		return "", syserror.ENOTDIR
 	}
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements FilesystemImpl.SetxattrAt.
-func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+// SetXattrAt implements FilesystemImpl.SetXattrAt.
+func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error {
 	if !rp.Done() {
 		return syserror.ENOTDIR
 	}
 	return syserror.EPERM
 }
 
-// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
-func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+// RemoveXattrAt implements FilesystemImpl.RemoveXattrAt.
+func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
 	if !rp.Done() {
 		return syserror.ENOTDIR
 	}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 22a54fa48..2b29a3c3f 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -101,7 +101,7 @@ type FileDescriptionOptions struct {
 
 	// If UseDentryMetadata is true, calls to FileDescription methods that
 	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
-	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+	// ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling
 	// the corresponding FilesystemImpl methods instead of the corresponding
 	// FileDescriptionImpl methods.
 	//
@@ -420,19 +420,19 @@ type FileDescriptionImpl interface {
 	// Ioctl implements the ioctl(2) syscall.
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
-	// Listxattr returns all extended attribute names for the file.
-	Listxattr(ctx context.Context, size uint64) ([]string, error)
+	// ListXattr returns all extended attribute names for the file.
+	ListXattr(ctx context.Context, size uint64) ([]string, error)
 
-	// Getxattr returns the value associated with the given extended attribute
+	// GetXattr returns the value associated with the given extended attribute
 	// for the file.
-	Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
+	GetXattr(ctx context.Context, opts GetXattrOptions) (string, error)
 
-	// Setxattr changes the value associated with the given extended attribute
+	// SetXattr changes the value associated with the given extended attribute
 	// for the file.
-	Setxattr(ctx context.Context, opts SetxattrOptions) error
+	SetXattr(ctx context.Context, opts SetXattrOptions) error
 
-	// Removexattr removes the given extended attribute from the file.
-	Removexattr(ctx context.Context, name string) error
+	// RemoveXattr removes the given extended attribute from the file.
+	RemoveXattr(ctx context.Context, name string) error
 
 	// LockBSD tries to acquire a BSD-style advisory file lock.
 	LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
@@ -635,25 +635,25 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 	return fd.impl.Ioctl(ctx, uio, args)
 }
 
-// Listxattr returns all extended attribute names for the file represented by
+// ListXattr returns all extended attribute names for the file represented by
 // fd.
 //
 // If the size of the list (including a NUL terminating byte after every entry)
 // would exceed size, ERANGE may be returned. Note that implementations
 // are free to ignore size entirely and return without error). In all cases,
 // if size is 0, the list should be returned without error, regardless of size.
-func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
+		names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
 		vfsObj.putResolvingPath(ctx, rp)
 		return names, err
 	}
-	names, err := fd.impl.Listxattr(ctx, size)
+	names, err := fd.impl.ListXattr(ctx, size)
 	if err == syserror.ENOTSUP {
 		// Linux doesn't actually return ENOTSUP in this case; instead,
 		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -664,57 +664,57 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string
 	return names, err
 }
 
-// Getxattr returns the value associated with the given extended attribute for
+// GetXattr returns the value associated with the given extended attribute for
 // the file represented by fd.
 //
 // If the size of the return value exceeds opts.Size, ERANGE may be returned
 // (note that implementations are free to ignore opts.Size entirely and return
 // without error). In all cases, if opts.Size is 0, the value should be
 // returned without error, regardless of size.
-func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
+func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+		val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(ctx, rp)
 		return val, err
 	}
-	return fd.impl.Getxattr(ctx, *opts)
+	return fd.impl.GetXattr(ctx, *opts)
 }
 
-// Setxattr changes the value associated with the given extended attribute for
+// SetXattr changes the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
+func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(ctx, rp)
 		return err
 	}
-	return fd.impl.Setxattr(ctx, *opts)
+	return fd.impl.SetXattr(ctx, *opts)
 }
 
-// Removexattr removes the given extended attribute from the file represented
+// RemoveXattr removes the given extended attribute from the file represented
 // by fd.
-func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
 		vfsObj.putResolvingPath(ctx, rp)
 		return err
 	}
-	return fd.impl.Removexattr(ctx, name)
+	return fd.impl.RemoveXattr(ctx, name)
 }
 
 // SyncFS instructs the filesystem containing fd to execute the semantics of
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 6b8b4ad49..68b80a951 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -134,28 +134,28 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 	return 0, syserror.ENOTTY
 }
 
-// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// ListXattr implements FileDescriptionImpl.ListXattr analogously to
 // inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	// This isn't exactly accurate; see FileDescription.Listxattr.
+func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	// This isn't exactly accurate; see FileDescription.ListXattr.
 	return nil, syserror.ENOTSUP
 }
 
-// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// GetXattr implements FileDescriptionImpl.GetXattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
+func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) {
 	return "", syserror.ENOTSUP
 }
 
-// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// SetXattr implements FileDescriptionImpl.SetXattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error {
 	return syserror.ENOTSUP
 }
 
-// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error {
 	return syserror.ENOTSUP
 }
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 46851f638..7dae4e7e8 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -416,26 +416,26 @@ type FilesystemImpl interface {
 	// ResolvingPath.Resolve*(), then !rp.Done().
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
-	// ListxattrAt returns all extended attribute names for the file at rp.
+	// ListXattrAt returns all extended attribute names for the file at rp.
 	//
 	// Errors:
 	//
 	// - If extended attributes are not supported by the filesystem,
-	// ListxattrAt returns ENOTSUP.
+	// ListXattrAt returns ENOTSUP.
 	//
 	// - If the size of the list (including a NUL terminating byte after every
 	// entry) would exceed size, ERANGE may be returned. Note that
 	// implementations are free to ignore size entirely and return without
 	// error). In all cases, if size is 0, the list should be returned without
 	// error, regardless of size.
-	ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
+	ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
 
-	// GetxattrAt returns the value associated with the given extended
+	// GetXattrAt returns the value associated with the given extended
 	// attribute for the file at rp.
 	//
 	// Errors:
 	//
-	// - If extended attributes are not supported by the filesystem, GetxattrAt
+	// - If extended attributes are not supported by the filesystem, GetXattrAt
 	// returns ENOTSUP.
 	//
 	// - If an extended attribute named opts.Name does not exist, ENODATA is
@@ -445,30 +445,30 @@ type FilesystemImpl interface {
 	// returned (note that implementations are free to ignore opts.Size entirely
 	// and return without error). In all cases, if opts.Size is 0, the value
 	// should be returned without error, regardless of size.
-	GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
+	GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error)
 
-	// SetxattrAt changes the value associated with the given extended
+	// SetXattrAt changes the value associated with the given extended
 	// attribute for the file at rp.
 	//
 	// Errors:
 	//
-	// - If extended attributes are not supported by the filesystem, SetxattrAt
+	// - If extended attributes are not supported by the filesystem, SetXattrAt
 	// returns ENOTSUP.
 	//
 	// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
 	// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
 	// ENODATA is returned.
-	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+	SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error
 
-	// RemovexattrAt removes the given extended attribute from the file at rp.
+	// RemoveXattrAt removes the given extended attribute from the file at rp.
 	//
 	// Errors:
 	//
 	// - If extended attributes are not supported by the filesystem,
-	// RemovexattrAt returns ENOTSUP.
+	// RemoveXattrAt returns ENOTSUP.
 	//
 	// - If name does not exist, ENODATA is returned.
-	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+	RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
 	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
 	//
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
index cc1e7d764..638b5d830 100644
--- a/pkg/sentry/vfs/memxattr/xattr.go
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -33,8 +33,8 @@ type SimpleExtendedAttributes struct {
 	xattrs map[string]string
 }
 
-// Getxattr returns the value at 'name'.
-func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+// GetXattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) {
 	x.mu.RLock()
 	value, ok := x.xattrs[opts.Name]
 	x.mu.RUnlock()
@@ -49,8 +49,8 @@ func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string,
 	return value, nil
 }
 
-// Setxattr sets 'value' at 'name'.
-func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+// SetXattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error {
 	x.mu.Lock()
 	defer x.mu.Unlock()
 	if x.xattrs == nil {
@@ -72,8 +72,8 @@ func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
 	return nil
 }
 
-// Listxattr returns all names in xattrs.
-func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+// ListXattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) {
 	// Keep track of the size of the buffer needed in listxattr(2) for the list.
 	listSize := 0
 	x.mu.RLock()
@@ -90,8 +90,8 @@ func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
 	return names, nil
 }
 
-// Removexattr removes the xattr at 'name'.
-func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+// RemoveXattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) RemoveXattr(name string) error {
 	x.mu.Lock()
 	defer x.mu.Unlock()
 	if _, ok := x.xattrs[name]; !ok {
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index dfc8573fd..b33d36cb1 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -190,10 +190,10 @@ type BoundEndpointOptions struct {
 	Addr string
 }
 
-// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
-// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
-// FileDescriptionImpl.Getxattr().
-type GetxattrOptions struct {
+// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(),
+// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and
+// FileDescriptionImpl.GetXattr().
+type GetXattrOptions struct {
 	// Name is the name of the extended attribute to retrieve.
 	Name string
 
@@ -204,10 +204,10 @@ type GetxattrOptions struct {
 	Size uint64
 }
 
-// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
-// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
-// FileDescriptionImpl.Setxattr().
-type SetxattrOptions struct {
+// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(),
+// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and
+// FileDescriptionImpl.SetXattr().
+type SetXattrOptions struct {
 	// Name is the name of the extended attribute being mutated.
 	Name string
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index ec27562d6..6825d81a5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -687,12 +687,12 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
 	}
 }
 
-// ListxattrAt returns all extended attribute names for the file at the given
+// ListXattrAt returns all extended attribute names for the file at the given
 // path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
+func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
+		names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return names, nil
@@ -712,12 +712,12 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
 	}
 }
 
-// GetxattrAt returns the value associated with the given extended attribute
+// GetXattrAt returns the value associated with the given extended attribute
 // for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
+func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+		val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return val, nil
@@ -729,12 +729,12 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden
 	}
 }
 
-// SetxattrAt changes the value associated with the given extended attribute
+// SetXattrAt changes the value associated with the given extended attribute
 // for the file at the given path.
-func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return nil
@@ -746,11 +746,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden
 	}
 }
 
-// RemovexattrAt removes the given extended attribute from the file at rp.
-func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+// RemoveXattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return nil
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 5510a87a0..1a1010bb5 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -232,7 +232,7 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
   EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
 }
 
-TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+TEST_F(XattrTest, SetXattrSizeSmallerThanValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -247,7 +247,7 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, SetxattrZeroSize) {
+TEST_F(XattrTest, SetXattrZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -259,7 +259,7 @@ TEST_F(XattrTest, SetxattrZeroSize) {
   EXPECT_EQ(buf, '-');
 }
 
-TEST_F(XattrTest, SetxattrSizeTooLarge) {
+TEST_F(XattrTest, SetXattrSizeTooLarge) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -274,7 +274,7 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+TEST_F(XattrTest, SetXattrNullValueAndNonzeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -283,7 +283,7 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+TEST_F(XattrTest, SetXattrNullValueAndZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -291,7 +291,7 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+TEST_F(XattrTest, SetXattrValueTooLargeButOKSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -307,7 +307,7 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+TEST_F(XattrTest, SetXattrReplaceWithSmaller) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -322,7 +322,7 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+TEST_F(XattrTest, SetXattrReplaceWithLarger) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -336,7 +336,7 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
   EXPECT_EQ(buf, val);
 }
 
-TEST_F(XattrTest, SetxattrCreateFlag) {
+TEST_F(XattrTest, SetXattrCreateFlag) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -347,7 +347,7 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, SetxattrReplaceFlag) {
+TEST_F(XattrTest, SetXattrReplaceFlag) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -359,14 +359,14 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, SetxattrInvalidFlags) {
+TEST_F(XattrTest, SetXattrInvalidFlags) {
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
               SyscallFailsWithErrno(EINVAL));
 }
 
-TEST_F(XattrTest, Getxattr) {
+TEST_F(XattrTest, GetXattr) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -378,7 +378,7 @@ TEST_F(XattrTest, Getxattr) {
   EXPECT_EQ(buf, val);
 }
 
-TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+TEST_F(XattrTest, GetXattrSizeSmallerThanValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -390,7 +390,7 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
   EXPECT_EQ(buf, '-');
 }
 
-TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+TEST_F(XattrTest, GetXattrSizeLargerThanValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -405,7 +405,7 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, GetxattrZeroSize) {
+TEST_F(XattrTest, GetXattrZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -418,7 +418,7 @@ TEST_F(XattrTest, GetxattrZeroSize) {
   EXPECT_EQ(buf, '-');
 }
 
-TEST_F(XattrTest, GetxattrSizeTooLarge) {
+TEST_F(XattrTest, GetXattrSizeTooLarge) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -434,7 +434,7 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, GetxattrNullValue) {
+TEST_F(XattrTest, GetXattrNullValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -445,7 +445,7 @@ TEST_F(XattrTest, GetxattrNullValue) {
               SyscallFailsWithErrno(EFAULT));
 }
 
-TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+TEST_F(XattrTest, GetXattrNullValueAndZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -461,13 +461,13 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size));
 }
 
-TEST_F(XattrTest, GetxattrNonexistentName) {
+TEST_F(XattrTest, GetXattrNonexistentName) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, Listxattr) {
+TEST_F(XattrTest, ListXattr) {
   const char* path = test_file_name_.c_str();
   const std::string name = "user.test";
   const std::string name2 = "user.test2";
@@ -493,7 +493,7 @@ TEST_F(XattrTest, Listxattr) {
   EXPECT_EQ(got, expected);
 }
 
-TEST_F(XattrTest, ListxattrNoXattrs) {
+TEST_F(XattrTest, ListXattrNoXattrs) {
   const char* path = test_file_name_.c_str();
 
   std::vector<char> list, expected;
@@ -501,13 +501,13 @@ TEST_F(XattrTest, ListxattrNoXattrs) {
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(list, expected);
 
-  // Listxattr should succeed if there are no attributes, even if the buffer
+  // ListXattr should succeed if there are no attributes, even if the buffer
   // passed in is a nullptr.
   EXPECT_THAT(listxattr(path, nullptr, sizeof(list)),
               SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, ListxattrNullBuffer) {
+TEST_F(XattrTest, ListXattrNullBuffer) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -516,7 +516,7 @@ TEST_F(XattrTest, ListxattrNullBuffer) {
               SyscallFailsWithErrno(EFAULT));
 }
 
-TEST_F(XattrTest, ListxattrSizeTooSmall) {
+TEST_F(XattrTest, ListXattrSizeTooSmall) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -526,7 +526,7 @@ TEST_F(XattrTest, ListxattrSizeTooSmall) {
               SyscallFailsWithErrno(ERANGE));
 }
 
-TEST_F(XattrTest, ListxattrZeroSize) {
+TEST_F(XattrTest, ListXattrZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
-- 
cgit v1.2.3


From 89581f6495f201344227f3571eda8f9305e77d06 Mon Sep 17 00:00:00 2001
From: Ghanan Gowripalan <ghanan@google.com>
Date: Tue, 8 Sep 2020 12:15:58 -0700
Subject: Improve type safety for transport protocol options

The existing implementation for TransportProtocol.{Set}Option take
arguments of an empty interface type which all types (implicitly)
implement; any type may be passed to the functions.

This change introduces marker interfaces for transport protocol options
that may be set or queried which transport protocol option types
implement to ensure that invalid types are caught at compile time.
Different interfaces are used to allow the compiler to enforce read-only
or set-only socket options.

RELNOTES: n/a
PiperOrigin-RevId: 330559811
---
 pkg/sentry/socket/netstack/stack.go                |  22 +--
 pkg/tcpip/stack/registration.go                    |   4 +-
 pkg/tcpip/stack/stack.go                           |   4 +-
 pkg/tcpip/stack/transport_test.go                  |  59 ++-----
 pkg/tcpip/tcpip.go                                 | 127 ++++++++++++++-
 pkg/tcpip/transport/icmp/protocol.go               |   4 +-
 pkg/tcpip/transport/tcp/connect.go                 |   2 +-
 pkg/tcpip/transport/tcp/dual_stack_test.go         |   5 +-
 pkg/tcpip/transport/tcp/endpoint.go                |  20 +--
 pkg/tcpip/transport/tcp/endpoint_state.go          |   4 +-
 pkg/tcpip/transport/tcp/protocol.go                | 178 ++++++++-------------
 pkg/tcpip/transport/tcp/tcp_sack_test.go           |  15 +-
 pkg/tcpip/transport/tcp/tcp_test.go                | 164 +++++++++++--------
 pkg/tcpip/transport/tcp/tcp_timestamp_test.go      |  10 +-
 pkg/tcpip/transport/tcp/testing/context/context.go |  17 +-
 pkg/tcpip/transport/udp/protocol.go                |   4 +-
 runsc/boot/loader.go                               |  28 ++--
 test/benchmarks/tcp/tcp_proxy.go                   |  19 ++-
 18 files changed, 399 insertions(+), 287 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index f0fe18684..36144e1eb 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -155,7 +155,7 @@ func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
 
 // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
 func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
-	var rs tcp.ReceiveBufferSizeOption
+	var rs tcpip.TCPReceiveBufferSizeRangeOption
 	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs)
 	return inet.TCPBufferSize{
 		Min:     rs.Min,
@@ -166,17 +166,17 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
 
 // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize.
 func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error {
-	rs := tcp.ReceiveBufferSizeOption{
+	rs := tcpip.TCPReceiveBufferSizeRangeOption{
 		Min:     size.Min,
 		Default: size.Default,
 		Max:     size.Max,
 	}
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError()
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError()
 }
 
 // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize.
 func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
-	var ss tcp.SendBufferSizeOption
+	var ss tcpip.TCPSendBufferSizeRangeOption
 	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss)
 	return inet.TCPBufferSize{
 		Min:     ss.Min,
@@ -187,29 +187,30 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
 
 // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize.
 func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error {
-	ss := tcp.SendBufferSizeOption{
+	ss := tcpip.TCPSendBufferSizeRangeOption{
 		Min:     size.Min,
 		Default: size.Default,
 		Max:     size.Max,
 	}
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError()
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError()
 }
 
 // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled.
 func (s *Stack) TCPSACKEnabled() (bool, error) {
-	var sack tcp.SACKEnabled
+	var sack tcpip.TCPSACKEnabled
 	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack)
 	return bool(sack), syserr.TranslateNetstackError(err).ToError()
 }
 
 // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
 func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError()
+	opt := tcpip.TCPSACKEnabled(enabled)
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
 }
 
 // TCPRecovery implements inet.Stack.TCPRecovery.
 func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
-	var recovery tcp.Recovery
+	var recovery tcpip.TCPRecovery
 	if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil {
 		return 0, syserr.TranslateNetstackError(err).ToError()
 	}
@@ -218,7 +219,8 @@ func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
 
 // SetTCPRecovery implements inet.Stack.SetTCPRecovery.
 func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError()
+	opt := tcpip.TCPRecovery(recovery)
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
 }
 
 // Statistics implements inet.Stack.Statistics.
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 2d88fa1f7..4fa86a3ac 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -159,12 +159,12 @@ type TransportProtocol interface {
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
 	// provided option value is invalid.
-	SetOption(option interface{}) *tcpip.Error
+	SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error
 
 	// Option allows retrieving protocol specific option values.
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
-	Option(option interface{}) *tcpip.Error
+	Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error
 
 	// Close requests that any worker goroutines owned by the protocol
 	// stop.
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 133d90815..def8b0b43 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -817,7 +817,7 @@ func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, optio
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
 // is incorrect.
-func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	transProtoState, ok := s.transportProtocols[transport]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -832,7 +832,7 @@ func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumb
 // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil {
 //   ...
 // }
-func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	transProtoState, ok := s.transportProtocols[transport]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 9292bfccb..ef3457e32 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -291,22 +291,20 @@ func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack
 	return true
 }
 
-func (f *fakeTransportProtocol) SetOption(option interface{}) *tcpip.Error {
+func (f *fakeTransportProtocol) SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case fakeTransportGoodOption:
-		f.opts.good = bool(v)
+	case *tcpip.TCPModerateReceiveBufferOption:
+		f.opts.good = bool(*v)
 		return nil
-	case fakeTransportInvalidValueOption:
-		return tcpip.ErrInvalidOptionValue
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error {
+func (f *fakeTransportProtocol) Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case *fakeTransportGoodOption:
-		*v = fakeTransportGoodOption(f.opts.good)
+	case *tcpip.TCPModerateReceiveBufferOption:
+		*v = tcpip.TCPModerateReceiveBufferOption(f.opts.good)
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -533,41 +531,16 @@ func TestTransportOptions(t *testing.T) {
 		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
 	})
 
-	// Try an unsupported transport protocol.
-	if err := s.SetTransportProtocolOption(tcpip.TransportProtocolNumber(99999), fakeTransportGoodOption(false)); err != tcpip.ErrUnknownProtocol {
-		t.Fatalf("SetTransportProtocolOption(fakeTrans2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err)
-	}
-
-	testCases := []struct {
-		option   interface{}
-		wantErr  *tcpip.Error
-		verifier func(t *testing.T, p stack.TransportProtocol)
-	}{
-		{fakeTransportGoodOption(true), nil, func(t *testing.T, p stack.TransportProtocol) {
-			t.Helper()
-			fakeTrans := p.(*fakeTransportProtocol)
-			if fakeTrans.opts.good != true {
-				t.Fatalf("fakeTrans.opts.good = false, want = true")
-			}
-			var v fakeTransportGoodOption
-			if err := s.TransportProtocolOption(fakeTransNumber, &v); err != nil {
-				t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) = %v, want = nil, where v is option %T", v, err)
-			}
-			if v != true {
-				t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) returned v = %v, want = true", v)
-			}
-
-		}},
-		{fakeTransportBadOption(true), tcpip.ErrUnknownProtocolOption, nil},
-		{fakeTransportInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil},
-	}
-	for _, tc := range testCases {
-		if got := s.SetTransportProtocolOption(fakeTransNumber, tc.option); got != tc.wantErr {
-			t.Errorf("s.SetTransportProtocolOption(fakeTrans, %v) = %v, want = %v", tc.option, got, tc.wantErr)
-		}
-		if tc.verifier != nil {
-			tc.verifier(t, s.TransportProtocolInstance(fakeTransNumber))
-		}
+	v := tcpip.TCPModerateReceiveBufferOption(true)
+	if err := s.SetTransportProtocolOption(fakeTransNumber, &v); err != nil {
+		t.Errorf("s.SetTransportProtocolOption(fakeTrans, &%T(%t)): %s", v, v, err)
+	}
+	v = false
+	if err := s.TransportProtocolOption(fakeTransNumber, &v); err != nil {
+		t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &%T): %s", v, err)
+	}
+	if !v {
+		t.Fatalf("got tcpip.TCPModerateReceiveBufferOption = false, want = true")
 	}
 }
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 8ba615521..5e34e27ba 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -864,12 +864,93 @@ func (*DefaultTTLOption) isGettableNetworkProtocolOption() {}
 
 func (*DefaultTTLOption) isSettableNetworkProtocolOption() {}
 
-// AvailableCongestionControlOption is used to query the supported congestion
-// control algorithms.
-type AvailableCongestionControlOption string
+// GettableTransportProtocolOption is a marker interface for transport protocol
+// options that may be queried.
+type GettableTransportProtocolOption interface {
+	isGettableTransportProtocolOption()
+}
+
+// SettableTransportProtocolOption is a marker interface for transport protocol
+// options that may be set.
+type SettableTransportProtocolOption interface {
+	isSettableTransportProtocolOption()
+}
+
+// TCPSACKEnabled the SACK option for TCP.
+//
+// See: https://tools.ietf.org/html/rfc2018.
+type TCPSACKEnabled bool
+
+func (*TCPSACKEnabled) isGettableTransportProtocolOption() {}
+
+func (*TCPSACKEnabled) isSettableTransportProtocolOption() {}
+
+// TCPRecovery is the loss deteoction algorithm used by TCP.
+type TCPRecovery int32
+
+func (*TCPRecovery) isGettableTransportProtocolOption() {}
+
+func (*TCPRecovery) isSettableTransportProtocolOption() {}
+
+const (
+	// TCPRACKLossDetection indicates RACK is used for loss detection and
+	// recovery.
+	TCPRACKLossDetection TCPRecovery = 1 << iota
+
+	// TCPRACKStaticReoWnd indicates the reordering window should not be
+	// adjusted when DSACK is received.
+	TCPRACKStaticReoWnd
+
+	// TCPRACKNoDupTh indicates RACK should not consider the classic three
+	// duplicate acknowledgements rule to mark the segments as lost. This
+	// is used when reordering is not detected.
+	TCPRACKNoDupTh
+)
+
+// TCPDelayEnabled enables/disables Nagle's algorithm in TCP.
+type TCPDelayEnabled bool
+
+func (*TCPDelayEnabled) isGettableTransportProtocolOption() {}
+
+func (*TCPDelayEnabled) isSettableTransportProtocolOption() {}
+
+// TCPSendBufferSizeRangeOption is the send buffer size range for TCP.
+type TCPSendBufferSizeRangeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+func (*TCPSendBufferSizeRangeOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSendBufferSizeRangeOption) isSettableTransportProtocolOption() {}
+
+// TCPReceiveBufferSizeRangeOption is the receive buffer size range for TCP.
+type TCPReceiveBufferSizeRangeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+func (*TCPReceiveBufferSizeRangeOption) isGettableTransportProtocolOption() {}
+
+func (*TCPReceiveBufferSizeRangeOption) isSettableTransportProtocolOption() {}
+
+// TCPAvailableCongestionControlOption is the supported congestion control
+// algorithms for TCP
+type TCPAvailableCongestionControlOption string
+
+func (*TCPAvailableCongestionControlOption) isGettableTransportProtocolOption() {}
+
+func (*TCPAvailableCongestionControlOption) isSettableTransportProtocolOption() {}
+
+// TCPModerateReceiveBufferOption enables/disables receive buffer moderation
+// for TCP.
+type TCPModerateReceiveBufferOption bool
 
-// ModerateReceiveBufferOption is used by buffer moderation.
-type ModerateReceiveBufferOption bool
+func (*TCPModerateReceiveBufferOption) isGettableTransportProtocolOption() {}
+
+func (*TCPModerateReceiveBufferOption) isSettableTransportProtocolOption() {}
 
 // GettableSocketOption is a marker interface for socket options that may be
 // queried.
@@ -935,6 +1016,10 @@ func (*CongestionControlOption) isGettableSocketOption() {}
 
 func (*CongestionControlOption) isSettableSocketOption() {}
 
+func (*CongestionControlOption) isGettableTransportProtocolOption() {}
+
+func (*CongestionControlOption) isSettableTransportProtocolOption() {}
+
 // TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
 // before being marked closed.
@@ -944,6 +1029,10 @@ func (*TCPLingerTimeoutOption) isGettableSocketOption() {}
 
 func (*TCPLingerTimeoutOption) isSettableSocketOption() {}
 
+func (*TCPLingerTimeoutOption) isGettableTransportProtocolOption() {}
+
+func (*TCPLingerTimeoutOption) isSettableTransportProtocolOption() {}
+
 // TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TIME_WAIT state
 // before being marked closed.
@@ -953,6 +1042,10 @@ func (*TCPTimeWaitTimeoutOption) isGettableSocketOption() {}
 
 func (*TCPTimeWaitTimeoutOption) isSettableSocketOption() {}
 
+func (*TCPTimeWaitTimeoutOption) isGettableTransportProtocolOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isSettableTransportProtocolOption() {}
+
 // TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow a
 // accept to return a completed connection only when there is data to be
 // read. This usually means the listening socket will drop the final ACK
@@ -971,6 +1064,10 @@ func (*TCPMinRTOOption) isGettableSocketOption() {}
 
 func (*TCPMinRTOOption) isSettableSocketOption() {}
 
+func (*TCPMinRTOOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMinRTOOption) isSettableTransportProtocolOption() {}
+
 // TCPMaxRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
 // default MaxRTO used by the Stack.
 type TCPMaxRTOOption time.Duration
@@ -979,6 +1076,10 @@ func (*TCPMaxRTOOption) isGettableSocketOption() {}
 
 func (*TCPMaxRTOOption) isSettableSocketOption() {}
 
+func (*TCPMaxRTOOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMaxRTOOption) isSettableTransportProtocolOption() {}
+
 // TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum number of retransmits after which we time out the connection.
 type TCPMaxRetriesOption uint64
@@ -987,6 +1088,10 @@ func (*TCPMaxRetriesOption) isGettableSocketOption() {}
 
 func (*TCPMaxRetriesOption) isSettableSocketOption() {}
 
+func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {}
+
 // TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
 // the number of endpoints that can be in SYN-RCVD state before the stack
 // switches to using SYN cookies.
@@ -996,6 +1101,10 @@ func (*TCPSynRcvdCountThresholdOption) isGettableSocketOption() {}
 
 func (*TCPSynRcvdCountThresholdOption) isSettableSocketOption() {}
 
+func (*TCPSynRcvdCountThresholdOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isSettableTransportProtocolOption() {}
+
 // TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
 // default for number of times SYN is retransmitted before aborting a connect.
 type TCPSynRetriesOption uint8
@@ -1004,6 +1113,10 @@ func (*TCPSynRetriesOption) isGettableSocketOption() {}
 
 func (*TCPSynRetriesOption) isSettableSocketOption() {}
 
+func (*TCPSynRetriesOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSynRetriesOption) isSettableTransportProtocolOption() {}
+
 // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
 // default interface for multicast.
 type MulticastInterfaceOption struct {
@@ -1062,6 +1175,10 @@ func (*TCPTimeWaitReuseOption) isGettableSocketOption() {}
 
 func (*TCPTimeWaitReuseOption) isSettableSocketOption() {}
 
+func (*TCPTimeWaitReuseOption) isGettableTransportProtocolOption() {}
+
+func (*TCPTimeWaitReuseOption) isSettableTransportProtocolOption() {}
+
 const (
 	// TCPTimeWaitReuseDisabled indicates reuse of port bound by endponts in TIME-WAIT cannot
 	// be reused for new connections.
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 74ef6541e..bb11e4e83 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -109,12 +109,12 @@ func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEnd
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
-func (*protocol) SetOption(option interface{}) *tcpip.Error {
+func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // Option implements stack.TransportProtocol.Option.
-func (*protocol) Option(option interface{}) *tcpip.Error {
+func (*protocol) Option(tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 72df5c2a1..09d53d158 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -522,7 +522,7 @@ func (h *handshake) execute() *tcpip.Error {
 	s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment)
 	defer s.Done()
 
-	var sackEnabled SACKEnabled
+	var sackEnabled tcpip.TCPSACKEnabled
 	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
 		// If stack returned an error when checking for SACKEnabled
 		// status then just default to switching off SACK negotiation.
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 80e9dd465..94207c141 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -560,8 +560,9 @@ func TestV4AcceptOnV4(t *testing.T) {
 func testV4ListenClose(t *testing.T, c *context.Context) {
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-		t.Fatalf("setting TCPSynRcvdCountThresholdOption failed: %s", err)
+	var opt tcpip.TCPSynRcvdCountThresholdOption
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	const n = uint16(32)
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4cf966b65..8cb769d58 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -849,12 +849,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		maxSynRetries: DefaultSynRetries,
 	}
 
-	var ss SendBufferSizeOption
+	var ss tcpip.TCPSendBufferSizeRangeOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
 		e.sndBufSize = ss.Default
 	}
 
-	var rs ReceiveBufferSizeOption
+	var rs tcpip.TCPReceiveBufferSizeRangeOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 		e.rcvBufSize = rs.Default
 	}
@@ -864,12 +864,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		e.cc = cs
 	}
 
-	var mrb tcpip.ModerateReceiveBufferOption
+	var mrb tcpip.TCPModerateReceiveBufferOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
 		e.rcvAutoParams.disabled = !bool(mrb)
 	}
 
-	var de DelayEnabled
+	var de tcpip.TCPDelayEnabled
 	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
 		e.SetSockOptBool(tcpip.DelayOption, true)
 	}
@@ -1609,7 +1609,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
-		var rs ReceiveBufferSizeOption
+		var rs tcpip.TCPReceiveBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 			if v < rs.Min {
 				v = rs.Min
@@ -1659,7 +1659,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	case tcpip.SendBufferSizeOption:
 		// Make sure the send buffer size is within the min and max
 		// allowed.
-		var ss SendBufferSizeOption
+		var ss tcpip.TCPSendBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
 			if v < ss.Min {
 				v = ss.Min
@@ -1699,7 +1699,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 				return tcpip.ErrInvalidOptionValue
 			}
 		}
-		var rs ReceiveBufferSizeOption
+		var rs tcpip.TCPReceiveBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 			if v < rs.Min/2 {
 				v = rs.Min / 2
@@ -1748,7 +1748,7 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 		// Query the available cc algorithms in the stack and
 		// validate that the specified algorithm is actually
 		// supported in the stack.
-		var avail tcpip.AvailableCongestionControlOption
+		var avail tcpip.TCPAvailableCongestionControlOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
 			return err
 		}
@@ -2707,7 +2707,7 @@ func (e *endpoint) receiveBufferSize() int {
 }
 
 func (e *endpoint) maxReceiveBufferSize() int {
-	var rs ReceiveBufferSizeOption
+	var rs tcpip.TCPReceiveBufferSizeRangeOption
 	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
 		// As a fallback return the hardcoded max buffer size.
 		return MaxBufferSize
@@ -2787,7 +2787,7 @@ func timeStampOffset() uint32 {
 // if the SYN options indicate that the SACK option was negotiated and the TCP
 // stack is configured to enable TCP SACK option.
 func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
-	var v SACKEnabled
+	var v tcpip.TCPSACKEnabled
 	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
 		// Stack doesn't support SACK. So just return.
 		return
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 723e47ddc..41d0050f3 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -182,14 +182,14 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	epState := e.origEndpointState
 	switch epState {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
-		var ss SendBufferSizeOption
+		var ss tcpip.TCPSendBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
 			if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max {
 				panic(fmt.Sprintf("endpoint.sndBufSize %d is outside the min and max allowed [%d, %d]", e.sndBufSize, ss.Min, ss.Max))
 			}
 		}
 
-		var rs ReceiveBufferSizeOption
+		var rs tcpip.TCPReceiveBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 			if e.rcvBufSize < rs.Min || e.rcvBufSize > rs.Max {
 				panic(fmt.Sprintf("endpoint.rcvBufSize %d is outside the min and max allowed [%d, %d]", e.rcvBufSize, rs.Min, rs.Max))
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index c5afa2680..63ec12be8 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -79,50 +79,6 @@ const (
 	ccCubic = "cubic"
 )
 
-// SACKEnabled is used by stack.(*Stack).TransportProtocolOption to
-// enable/disable SACK support in TCP. See: https://tools.ietf.org/html/rfc2018.
-type SACKEnabled bool
-
-// Recovery is used by stack.(*Stack).TransportProtocolOption to
-// set loss detection algorithm in TCP.
-type Recovery int32
-
-const (
-	// RACKLossDetection indicates RACK is used for loss detection and
-	// recovery.
-	RACKLossDetection Recovery = 1 << iota
-
-	// RACKStaticReoWnd indicates the reordering window should not be
-	// adjusted when DSACK is received.
-	RACKStaticReoWnd
-
-	// RACKNoDupTh indicates RACK should not consider the classic three
-	// duplicate acknowledgements rule to mark the segments as lost. This
-	// is used when reordering is not detected.
-	RACKNoDupTh
-)
-
-// DelayEnabled is used by stack.(Stack*).TransportProtocolOption to
-// enable/disable Nagle's algorithm in TCP.
-type DelayEnabled bool
-
-// SendBufferSizeOption is used by stack.(Stack*).TransportProtocolOption
-// to get/set the default, min and max TCP send buffer sizes.
-type SendBufferSizeOption struct {
-	Min     int
-	Default int
-	Max     int
-}
-
-// ReceiveBufferSizeOption is used by
-// stack.(Stack*).TransportProtocolOption to get/set the default, min and max
-// TCP receive buffer sizes.
-type ReceiveBufferSizeOption struct {
-	Min     int
-	Default int
-	Max     int
-}
-
 // syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The
 // value is protected by a mutex so that we can increment only when it's
 // guaranteed not to go above a threshold.
@@ -183,10 +139,10 @@ func (s *synRcvdCounter) Threshold() uint64 {
 type protocol struct {
 	mu                         sync.RWMutex
 	sackEnabled                bool
-	recovery                   Recovery
+	recovery                   tcpip.TCPRecovery
 	delayEnabled               bool
-	sendBufferSize             SendBufferSizeOption
-	recvBufferSize             ReceiveBufferSizeOption
+	sendBufferSize             tcpip.TCPSendBufferSizeRangeOption
+	recvBufferSize             tcpip.TCPReceiveBufferSizeRangeOption
 	congestionControl          string
 	availableCongestionControl []string
 	moderateReceiveBuffer      bool
@@ -296,49 +252,49 @@ func replyWithReset(s *segment, tos, ttl uint8) {
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case SACKEnabled:
+	case *tcpip.TCPSACKEnabled:
 		p.mu.Lock()
-		p.sackEnabled = bool(v)
+		p.sackEnabled = bool(*v)
 		p.mu.Unlock()
 		return nil
 
-	case Recovery:
+	case *tcpip.TCPRecovery:
 		p.mu.Lock()
-		p.recovery = Recovery(v)
+		p.recovery = *v
 		p.mu.Unlock()
 		return nil
 
-	case DelayEnabled:
+	case *tcpip.TCPDelayEnabled:
 		p.mu.Lock()
-		p.delayEnabled = bool(v)
+		p.delayEnabled = bool(*v)
 		p.mu.Unlock()
 		return nil
 
-	case SendBufferSizeOption:
+	case *tcpip.TCPSendBufferSizeRangeOption:
 		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.sendBufferSize = v
+		p.sendBufferSize = *v
 		p.mu.Unlock()
 		return nil
 
-	case ReceiveBufferSizeOption:
+	case *tcpip.TCPReceiveBufferSizeRangeOption:
 		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.recvBufferSize = v
+		p.recvBufferSize = *v
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.CongestionControlOption:
+	case *tcpip.CongestionControlOption:
 		for _, c := range p.availableCongestionControl {
-			if string(v) == c {
+			if string(*v) == c {
 				p.mu.Lock()
-				p.congestionControl = string(v)
+				p.congestionControl = string(*v)
 				p.mu.Unlock()
 				return nil
 			}
@@ -347,75 +303,79 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		// is specified.
 		return tcpip.ErrNoSuchFile
 
-	case tcpip.ModerateReceiveBufferOption:
+	case *tcpip.TCPModerateReceiveBufferOption:
 		p.mu.Lock()
-		p.moderateReceiveBuffer = bool(v)
+		p.moderateReceiveBuffer = bool(*v)
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPLingerTimeoutOption:
-		if v < 0 {
-			v = 0
-		}
+	case *tcpip.TCPLingerTimeoutOption:
 		p.mu.Lock()
-		p.lingerTimeout = time.Duration(v)
+		if *v < 0 {
+			p.lingerTimeout = 0
+		} else {
+			p.lingerTimeout = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPTimeWaitTimeoutOption:
-		if v < 0 {
-			v = 0
-		}
+	case *tcpip.TCPTimeWaitTimeoutOption:
 		p.mu.Lock()
-		p.timeWaitTimeout = time.Duration(v)
+		if *v < 0 {
+			p.timeWaitTimeout = 0
+		} else {
+			p.timeWaitTimeout = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPTimeWaitReuseOption:
-		if v < tcpip.TCPTimeWaitReuseDisabled || v > tcpip.TCPTimeWaitReuseLoopbackOnly {
+	case *tcpip.TCPTimeWaitReuseOption:
+		if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.timeWaitReuse = v
+		p.timeWaitReuse = *v
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPMinRTOOption:
-		if v < 0 {
-			v = tcpip.TCPMinRTOOption(MinRTO)
-		}
+	case *tcpip.TCPMinRTOOption:
 		p.mu.Lock()
-		p.minRTO = time.Duration(v)
+		if *v < 0 {
+			p.minRTO = MinRTO
+		} else {
+			p.minRTO = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPMaxRTOOption:
-		if v < 0 {
-			v = tcpip.TCPMaxRTOOption(MaxRTO)
-		}
+	case *tcpip.TCPMaxRTOOption:
 		p.mu.Lock()
-		p.maxRTO = time.Duration(v)
+		if *v < 0 {
+			p.maxRTO = MaxRTO
+		} else {
+			p.maxRTO = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPMaxRetriesOption:
+	case *tcpip.TCPMaxRetriesOption:
 		p.mu.Lock()
-		p.maxRetries = uint32(v)
+		p.maxRetries = uint32(*v)
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPSynRcvdCountThresholdOption:
+	case *tcpip.TCPSynRcvdCountThresholdOption:
 		p.mu.Lock()
-		p.synRcvdCount.SetThreshold(uint64(v))
+		p.synRcvdCount.SetThreshold(uint64(*v))
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPSynRetriesOption:
-		if v < 1 || v > 255 {
+	case *tcpip.TCPSynRetriesOption:
+		if *v < 1 || *v > 255 {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.synRetries = uint8(v)
+		p.synRetries = uint8(*v)
 		p.mu.Unlock()
 		return nil
 
@@ -425,33 +385,33 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 }
 
 // Option implements stack.TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case *SACKEnabled:
+	case *tcpip.TCPSACKEnabled:
 		p.mu.RLock()
-		*v = SACKEnabled(p.sackEnabled)
+		*v = tcpip.TCPSACKEnabled(p.sackEnabled)
 		p.mu.RUnlock()
 		return nil
 
-	case *Recovery:
+	case *tcpip.TCPRecovery:
 		p.mu.RLock()
-		*v = Recovery(p.recovery)
+		*v = tcpip.TCPRecovery(p.recovery)
 		p.mu.RUnlock()
 		return nil
 
-	case *DelayEnabled:
+	case *tcpip.TCPDelayEnabled:
 		p.mu.RLock()
-		*v = DelayEnabled(p.delayEnabled)
+		*v = tcpip.TCPDelayEnabled(p.delayEnabled)
 		p.mu.RUnlock()
 		return nil
 
-	case *SendBufferSizeOption:
+	case *tcpip.TCPSendBufferSizeRangeOption:
 		p.mu.RLock()
 		*v = p.sendBufferSize
 		p.mu.RUnlock()
 		return nil
 
-	case *ReceiveBufferSizeOption:
+	case *tcpip.TCPReceiveBufferSizeRangeOption:
 		p.mu.RLock()
 		*v = p.recvBufferSize
 		p.mu.RUnlock()
@@ -463,15 +423,15 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 		p.mu.RUnlock()
 		return nil
 
-	case *tcpip.AvailableCongestionControlOption:
+	case *tcpip.TCPAvailableCongestionControlOption:
 		p.mu.RLock()
-		*v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
+		*v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
 		p.mu.RUnlock()
 		return nil
 
-	case *tcpip.ModerateReceiveBufferOption:
+	case *tcpip.TCPModerateReceiveBufferOption:
 		p.mu.RLock()
-		*v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer)
+		*v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer)
 		p.mu.RUnlock()
 		return nil
 
@@ -567,12 +527,12 @@ func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
 // NewProtocol returns a TCP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	p := protocol{
-		sendBufferSize: SendBufferSizeOption{
+		sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{
 			Min:     MinBufferSize,
 			Default: DefaultSendBufferSize,
 			Max:     MaxBufferSize,
 		},
-		recvBufferSize: ReceiveBufferSizeOption{
+		recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{
 			Min:     MinBufferSize,
 			Default: DefaultReceiveBufferSize,
 			Max:     MaxBufferSize,
@@ -587,7 +547,7 @@ func NewProtocol() stack.TransportProtocol {
 		minRTO:                     MinRTO,
 		maxRTO:                     MaxRTO,
 		maxRetries:                 MaxRetries,
-		recovery:                   RACKLossDetection,
+		recovery:                   tcpip.TCPRACKLossDetection,
 	}
 	p.dispatcher.init(runtime.GOMAXPROCS(0))
 	return &p
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index 99521f0c1..ef7f5719f 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -46,8 +46,9 @@ func createConnectedWithSACKAndTS(c *context.Context) *context.RawEndpoint {
 
 func setStackSACKPermitted(t *testing.T, c *context.Context, enable bool) {
 	t.Helper()
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enable)); err != nil {
-		t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, SACKEnabled(%t) = %s", enable, err)
+	opt := tcpip.TCPSACKEnabled(enable)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("c.s.SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 }
 
@@ -162,8 +163,9 @@ func TestSackPermittedAccept(t *testing.T) {
 						// Set the SynRcvd threshold to
 						// zero to force a syn cookie
 						// based accept to happen.
-						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						var opt tcpip.TCPSynRcvdCountThresholdOption
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+							t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 						}
 					}
 					setStackSACKPermitted(t, c, sackEnabled)
@@ -236,8 +238,9 @@ func TestSackDisabledAccept(t *testing.T) {
 						// Set the SynRcvd threshold to
 						// zero to force a syn cookie
 						// based accept to happen.
-						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						var opt tcpip.TCPSynRcvdCountThresholdOption
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+							t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 						}
 					}
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 3d09d6def..0d13e1efd 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -309,8 +309,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	// Lower stackwide TIME_WAIT timeout so that the reservations
 	// are released instantly on Close.
 	tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond)
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil {
-		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %#v) = %s", tcp.ProtocolNumber, tcpTW, err)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &tcpTW); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, tcpTW, tcpTW, err)
 	}
 
 	c.EP.Close()
@@ -432,8 +432,9 @@ func TestConnectResetAfterClose(t *testing.T) {
 	// Set TCPLinger to 3 seconds so that sockets are marked closed
 	// after 3 second in FIN_WAIT2 state.
 	tcpLingerTimeout := 3 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%s) failed: %s", tcpLingerTimeout, err)
+	opt := tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
@@ -506,8 +507,9 @@ func TestCurrentConnectedIncrement(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed
 	// after 1 second in TIME_WAIT state.
 	tcpTimeWaitTimeout := 1 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
@@ -933,8 +935,8 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
 
 					// Set the SynRcvd threshold to force a syn cookie based accept to happen.
 					opt := tcpip.TCPSynRcvdCountThresholdOption(nonSynCookieAccepts)
-					if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
-						t.Fatalf("SetTransportProtocolOption(%d, %#v): %s", tcp.ProtocolNumber, opt, err)
+					if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+						t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 					}
 
 					if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil {
@@ -2867,8 +2869,9 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+	opt := tcpip.TCPSynRcvdCountThresholdOption(0)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	// Create EP and start listening.
@@ -3146,8 +3149,9 @@ func TestMaxRetransmitsTimeout(t *testing.T) {
 	defer c.Cleanup()
 
 	const numRetries = 2
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRetriesOption(numRetries)); err != nil {
-		t.Fatalf("could not set protocol option MaxRetries.\n")
+	opt := tcpip.TCPMaxRetriesOption(numRetries)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
@@ -3206,8 +3210,9 @@ func TestMaxRTO(t *testing.T) {
 	defer c.Cleanup()
 
 	rto := 1 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRTOOption(rto)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPMaxRTO(%d) failed: %s", rto, err)
+	opt := tcpip.TCPMaxRTOOption(rto)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
@@ -3964,8 +3969,9 @@ func TestReadAfterClosedState(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed
 	// after 1 second in TIME_WAIT state.
 	tcpTimeWaitTimeout := 1 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
@@ -4204,11 +4210,15 @@ func TestDefaultBufferSizes(t *testing.T) {
 	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize)
 
 	// Change the default send buffer size.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{
-		Min:     1,
-		Default: tcp.DefaultSendBufferSize * 2,
-		Max:     tcp.DefaultSendBufferSize * 20}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPSendBufferSizeRangeOption{
+			Min:     1,
+			Default: tcp.DefaultSendBufferSize * 2,
+			Max:     tcp.DefaultSendBufferSize * 20,
+		}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	ep.Close()
@@ -4221,11 +4231,15 @@ func TestDefaultBufferSizes(t *testing.T) {
 	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize)
 
 	// Change the default receive buffer size.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{
-		Min:     1,
-		Default: tcp.DefaultReceiveBufferSize * 3,
-		Max:     tcp.DefaultReceiveBufferSize * 30}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %v", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{
+			Min:     1,
+			Default: tcp.DefaultReceiveBufferSize * 3,
+			Max:     tcp.DefaultReceiveBufferSize * 30,
+		}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	ep.Close()
@@ -4252,12 +4266,18 @@ func TestMinMaxBufferSizes(t *testing.T) {
 	defer ep.Close()
 
 	// Change the min/max values for send/receive
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPSendBufferSizeRangeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	// Set values below the min.
@@ -4718,8 +4738,8 @@ func TestStackSetCongestionControl(t *testing.T) {
 				t.Fatalf("s.TransportProtocolOption(%v, %v) = %s", tcp.ProtocolNumber, &oldCC, err)
 			}
 
-			if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != tc.err {
-				t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.cc, err, tc.err)
+			if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &tc.cc); err != tc.err {
+				t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%s)) = %s, want = %s", tcp.ProtocolNumber, tc.cc, tc.cc, err, tc.err)
 			}
 
 			var cc tcpip.CongestionControlOption
@@ -4751,12 +4771,12 @@ func TestStackAvailableCongestionControl(t *testing.T) {
 	s := c.Stack()
 
 	// Query permitted congestion control algorithms.
-	var aCC tcpip.AvailableCongestionControlOption
+	var aCC tcpip.TCPAvailableCongestionControlOption
 	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &aCC); err != nil {
 		t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &aCC, err)
 	}
-	if got, want := aCC, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
-		t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
+	if got, want := aCC, tcpip.TCPAvailableCongestionControlOption("reno cubic"); got != want {
+		t.Fatalf("got tcpip.TCPAvailableCongestionControlOption: %v, want: %v", got, want)
 	}
 }
 
@@ -4767,18 +4787,18 @@ func TestStackSetAvailableCongestionControl(t *testing.T) {
 	s := c.Stack()
 
 	// Setting AvailableCongestionControlOption should fail.
-	aCC := tcpip.AvailableCongestionControlOption("xyz")
+	aCC := tcpip.TCPAvailableCongestionControlOption("xyz")
 	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &aCC); err == nil {
-		t.Fatalf("s.TransportProtocolOption(%v, %v) = nil, want non-nil", tcp.ProtocolNumber, &aCC)
+		t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%s)) = nil, want non-nil", tcp.ProtocolNumber, aCC, aCC)
 	}
 
 	// Verify that we still get the expected list of congestion control options.
-	var cc tcpip.AvailableCongestionControlOption
+	var cc tcpip.TCPAvailableCongestionControlOption
 	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil {
-		t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err)
+		t.Fatalf("s.TransportProtocolOptio(%d, &%T(%s)): %s", tcp.ProtocolNumber, cc, cc, err)
 	}
-	if got, want := cc, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
-		t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
+	if got, want := cc, tcpip.TCPAvailableCongestionControlOption("reno cubic"); got != want {
+		t.Fatalf("got tcpip.TCPAvailableCongestionControlOption = %s, want = %s", got, want)
 	}
 }
 
@@ -4842,8 +4862,8 @@ func TestEndpointSetCongestionControl(t *testing.T) {
 func enableCUBIC(t *testing.T, c *context.Context) {
 	t.Helper()
 	opt := tcpip.CongestionControlOption("cubic")
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
-		t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, %s = %s", opt, err)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)) %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 }
 
@@ -5505,8 +5525,9 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(1)); err != nil {
-		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 1 failed: %s", err)
+	opt := tcpip.TCPSynRcvdCountThresholdOption(1)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	// Create TCP endpoint.
@@ -5906,13 +5927,19 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	// the segment queue holding unprocessed packets is limited to 500.
 	const receiveBufferSize = 80 << 10 // 80KB.
 	const maxReceiveBufferSize = receiveBufferSize * 10
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	// Enable auto-tuning.
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 	// Change the expected window scale to match the value needed for the
 	// maximum buffer size defined above.
@@ -6027,13 +6054,19 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 	// the segment queue holding unprocessed packets is limited to 300.
 	const receiveBufferSize = 80 << 10 // 80KB.
 	const maxReceiveBufferSize = receiveBufferSize * 10
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	// Enable auto-tuning.
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 	// Change the expected window scale to match the value needed for the
 	// maximum buffer size used by stack.
@@ -6169,7 +6202,7 @@ func TestDelayEnabled(t *testing.T) {
 	checkDelayOption(t, c, false, false) // Delay is disabled by default.
 
 	for _, v := range []struct {
-		delayEnabled    tcp.DelayEnabled
+		delayEnabled    tcpip.TCPDelayEnabled
 		wantDelayOption bool
 	}{
 		{delayEnabled: false, wantDelayOption: false},
@@ -6177,17 +6210,17 @@ func TestDelayEnabled(t *testing.T) {
 	} {
 		c := context.New(t, defaultMTU)
 		defer c.Cleanup()
-		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, v.delayEnabled); err != nil {
-			t.Fatalf("SetTransportProtocolOption(tcp, %t) failed: %s", v.delayEnabled, err)
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &v.delayEnabled); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, v.delayEnabled, v.delayEnabled, err)
 		}
 		checkDelayOption(t, c, v.delayEnabled, v.wantDelayOption)
 	}
 }
 
-func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption bool) {
+func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcpip.TCPDelayEnabled, wantDelayOption bool) {
 	t.Helper()
 
-	var gotDelayEnabled tcp.DelayEnabled
+	var gotDelayEnabled tcpip.TCPDelayEnabled
 	if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &gotDelayEnabled); err != nil {
 		t.Fatalf("TransportProtocolOption(tcp, &gotDelayEnabled) failed: %s", err)
 	}
@@ -6625,8 +6658,9 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
 	// after 5 seconds in TIME_WAIT state.
 	tcpTimeWaitTimeout := 5 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, tcpTimeWaitTimeout, err)
 	}
 
 	want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1
@@ -6775,8 +6809,9 @@ func TestTCPCloseWithData(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
 	// after 5 seconds in TIME_WAIT state.
 	tcpTimeWaitTimeout := 5 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, tcpTimeWaitTimeout, err)
 	}
 
 	wq := &waiter.Queue{}
@@ -7462,9 +7497,10 @@ func TestSetStackTimeWaitReuse(t *testing.T) {
 	}
 
 	for _, tc := range testCases {
-		err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitReuseOption(tc.v))
+		opt := tcpip.TCPTimeWaitReuseOption(tc.v)
+		err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)
 		if got, want := err, tc.err; got != want {
-			t.Fatalf("s.TransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.v, err, tc.err)
+			t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%d)) = %s, want = %s", tcp.ProtocolNumber, tc.v, tc.v, err, tc.err)
 		}
 		if tc.err != nil {
 			continue
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
index 8edbff964..44593ed98 100644
--- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -131,8 +131,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
 	defer c.Cleanup()
 
 	if cookieEnabled {
-		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		var opt tcpip.TCPSynRcvdCountThresholdOption
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
@@ -192,8 +193,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
 	defer c.Cleanup()
 
 	if cookieEnabled {
-		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		var opt tcpip.TCPSynRcvdCountThresholdOption
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 8bb5e5f6d..baf7df197 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -146,19 +146,22 @@ func New(t *testing.T, mtu uint32) *Context {
 	const sendBufferSize = 1 << 20 // 1 MiB
 	const recvBufferSize = 1 << 20 // 1 MiB
 	// Allow minimum send/receive buffer sizes to be 1 during tests.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	sendBufOpt := tcpip.TCPSendBufferSizeRangeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &sendBufOpt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%#v) failed: %s", tcp.ProtocolNumber, sendBufOpt, err)
 	}
 
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	rcvBufOpt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &rcvBufOpt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%#v) failed: %s", tcp.ProtocolNumber, rcvBufOpt, err)
 	}
 
 	// Increase minimum RTO in tests to avoid test flakes due to early
 	// retransmit in case the test executors are overloaded and cause timers
 	// to fire earlier than expected.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMinRTOOption(3*time.Second)); err != nil {
-		t.Fatalf("failed to set stack-wide minRTO: %s", err)
+	minRTOOpt := tcpip.TCPMinRTOOption(3 * time.Second)
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &minRTOOpt); err != nil {
+		t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, minRTOOpt, minRTOOpt, err)
 	}
 
 	// Some of the congestion control tests send up to 640 packets, we so
@@ -1096,7 +1099,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions
 // SACKEnabled returns true if the TCP Protocol option SACKEnabled is set to true
 // for the Stack in the context.
 func (c *Context) SACKEnabled() bool {
-	var v tcp.SACKEnabled
+	var v tcpip.TCPSACKEnabled
 	if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &v); err != nil {
 		// Stack doesn't support SACK. So just return.
 		return false
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index f65751dd4..3f87e8057 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -202,12 +202,12 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // Option implements stack.TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (*protocol) Option(tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 246ae3c3e..a136da21a 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -1045,22 +1045,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 	})}
 
 	// Enable SACK Recovery.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-		return nil, fmt.Errorf("failed to enable SACK: %s", err)
+	{
+		opt := tcpip.TCPSACKEnabled(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Set default TTLs as required by socket/netstack.
-	opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
-	if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
-		return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
-	}
-	if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
-		return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+	{
+		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+		}
+		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Enable Receive Buffer Auto-Tuning.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	return &s, nil
diff --git a/test/benchmarks/tcp/tcp_proxy.go b/test/benchmarks/tcp/tcp_proxy.go
index 4b7ca7a14..6cabfb451 100644
--- a/test/benchmarks/tcp/tcp_proxy.go
+++ b/test/benchmarks/tcp/tcp_proxy.go
@@ -228,19 +228,26 @@ func newNetstackImpl(mode string) (impl, error) {
 	})
 
 	// Set protocol options.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(*sack)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %s", err)
+	{
+		opt := tcpip.TCPSACKEnabled(*sack)
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Enable Receive Buffer Auto-Tuning.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(*moderateRecvBuf)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(*moderateRecvBuf)
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Set Congestion Control to cubic if requested.
 	if *cubic {
-		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption for CongestionControlOption(cubic) failed: %s", err)
+		opt := tcpip.CongestionControlOption("cubic")
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
-- 
cgit v1.2.3


From 03a529d8ac52ab0227a0bbd85aaa3d8808b11e59 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 8 Sep 2020 12:29:26 -0700
Subject: Fix data race in tcp.GetSockOpt.

e.ID can't be read without holding e.mu. GetSockOpt was reading e.ID
when looking up OriginalDst without holding e.mu.

PiperOrigin-RevId: 330562293
---
 pkg/tcpip/transport/tcp/endpoint.go | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pkg')

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8cb769d58..6d5046a3d 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2019,8 +2019,10 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 		e.UnlockUser()
 
 	case *tcpip.OriginalDestinationOption:
+		e.LockUser()
 		ipt := e.stack.IPTables()
 		addr, port, err := ipt.OriginalDst(e.ID)
+		e.UnlockUser()
 		if err != nil {
 			return err
 		}
-- 
cgit v1.2.3


From 982ac0e46a1ed6a76ef09d1e6eb7a19ed20b03c8 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Tue, 8 Sep 2020 15:50:29 +0800
Subject: Fix the use after nil check on args.MountNamespaceVFS2

The args.MountNamespaceVFS2 is used again after the nil check,
instead, mntnsVFS2 which holds the expected reference should be
used. This patch fixes this issue.

Fixes: #3855

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
---
 pkg/sentry/kernel/kernel.go | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 402aa1718..22f9bb006 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -888,17 +888,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		opener    fsbridge.Lookup
 		fsContext *FSContext
 		mntns     *fs.MountNamespace
+		mntnsVFS2 *vfs.MountNamespace
 	)
 
 	if VFS2Enabled {
-		mntnsVFS2 := args.MountNamespaceVFS2
+		mntnsVFS2 = args.MountNamespaceVFS2
 		if mntnsVFS2 == nil {
 			// MountNamespaceVFS2 adds a reference to the namespace, which is
 			// transferred to the new process.
 			mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
 		}
 		// Get the root directory from the MountNamespace.
-		root := args.MountNamespaceVFS2.Root()
+		root := mntnsVFS2.Root()
 		// The call to newFSContext below will take a reference on root, so we
 		// don't need to hold this one.
 		defer root.DecRef(ctx)
@@ -1008,7 +1009,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
-		MountNamespaceVFS2:      args.MountNamespaceVFS2,
+		MountNamespaceVFS2:      mntnsVFS2,
 		ContainerID:             args.ContainerID,
 	}
 	t, err := k.tasks.NewTask(config)
-- 
cgit v1.2.3


From 8a4c4aed6d6af1a5c0f8c0dc27b6177016c8617f Mon Sep 17 00:00:00 2001
From: Sam Balana <sbalana@google.com>
Date: Tue, 8 Sep 2020 12:48:28 -0700
Subject: Increase resolution timeout for TestCacheResolution

Fixes pkg/tcpip/stack:stack_test flake experienced while running
TestCacheResolution with gotsan. This occurs when the test-runner takes longer
than the resolution timeout to call linkAddrCache.get.

In this test we don't care about the resolution timeout, so set it to the
maximum and rely on test-runner timeouts to avoid deadlocks.

PiperOrigin-RevId: 330566250
---
 pkg/tcpip/stack/linkaddrcache_test.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'pkg')

diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go
index 14fb4239b..33806340e 100644
--- a/pkg/tcpip/stack/linkaddrcache_test.go
+++ b/pkg/tcpip/stack/linkaddrcache_test.go
@@ -16,6 +16,7 @@ package stack
 
 import (
 	"fmt"
+	"math"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -191,7 +192,13 @@ func TestCacheReplace(t *testing.T) {
 }
 
 func TestCacheResolution(t *testing.T) {
-	c := newLinkAddrCache(1<<63-1, 250*time.Millisecond, 1)
+	// There is a race condition causing this test to fail when the executor
+	// takes longer than the resolution timeout to call linkAddrCache.get. This
+	// is especially common when this test is run with gotsan.
+	//
+	// Using a large resolution timeout decreases the probability of experiencing
+	// this race condition and does not affect how long this test takes to run.
+	c := newLinkAddrCache(1<<63-1, math.MaxInt64, 1)
 	linkRes := &testLinkAddressResolver{cache: c}
 	for i, ta := range testAddrs {
 		got, err := getBlocking(c, ta.addr, linkRes)
-- 
cgit v1.2.3


From f2f92e52f6548b3b29d561d6d334a4f1fdbd8437 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 8 Sep 2020 13:58:50 -0700
Subject: Honor readonly flag for root mount

Updates #1487

PiperOrigin-RevId: 330580699
---
 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go            |   2 +-
 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go       |   2 +-
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |   6 +-
 pkg/sentry/fsimpl/ext/ext_test.go                 |   6 +-
 pkg/sentry/fsimpl/fuse/dev_test.go                |   2 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go           |   2 +-
 pkg/sentry/fsimpl/proc/tasks_test.go              |   2 +-
 pkg/sentry/fsimpl/sys/sys_test.go                 |   2 +-
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go         |   4 +-
 pkg/sentry/fsimpl/tmpfs/pipe_test.go              |   2 +-
 pkg/sentry/fsimpl/tmpfs/tmpfs_test.go             |   2 +-
 pkg/sentry/vfs/mount.go                           |   6 +-
 runsc/boot/fs.go                                  |   8 +-
 runsc/boot/loader_test.go                         |   4 +-
 runsc/boot/vfs.go                                 |  35 ++++-
 runsc/container/container_test.go                 | 180 +++++++++++++---------
 runsc/container/multi_container_test.go           |  21 +--
 runsc/container/shared_volume_test.go             |  12 +-
 18 files changed, 178 insertions(+), 120 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 52f44f66d..a23094e54 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -80,7 +80,7 @@ type Accessor struct {
 // NewAccessor returns an Accessor that supports creation of device special
 // files in the devtmpfs instance registered with name fsTypeName in vfsObj.
 func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{})
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 827a608cb..3a38b8bb4 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -48,7 +48,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu
 	})
 
 	// Create a test mount namespace with devtmpfs mounted at "/dev".
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index a2cc9b59f..c349b886e 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -59,7 +59,11 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 2dbaee287..0989558cd 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -71,7 +71,11 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
index 1ffe7ccd2..aedc2fa39 100644
--- a/pkg/sentry/fsimpl/fuse/dev_test.go
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -342,7 +342,7 @@ func setup(t *testing.T) *testutil.System {
 		AllowUserMount: true,
 	})
 
-	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 675587c6b..09806a3f2 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
 	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create testfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index d82b3d2f3..f693f9060 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -104,7 +104,7 @@ func setup(t *testing.T) *testutil.System {
 		AllowUserMount: true,
 	})
 
-	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 9fd38b295..0a0d914cc 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System {
 		AllowUserMount: true,
 	})
 
-	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index e5a4218e8..5209a17af 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -376,7 +376,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index ec2701d8b..be29a2363 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -158,7 +158,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index 6f3e3ae6f..99c8e3c0f 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -41,7 +41,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 	if err != nil {
 		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index db5fb3bb1..06ca91989 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -154,13 +154,13 @@ type MountNamespace struct {
 // NewMountNamespace returns a new mount namespace with a root filesystem
 // configured by the given arguments. A reference is taken on the returned
 // MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
 		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
 		return nil, syserror.ENODEV
 	}
-	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return nil, err
 	}
@@ -169,7 +169,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 		mountpoints: make(map[*Dentry]uint32),
 	}
 	mntns.EnableLeakCheck()
-	mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
+	mntns.root = newMount(vfs, fs, root, mntns, opts)
 	return mntns, nil
 }
 
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e2c5f5fb1..ddf288456 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -254,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
 
 // addSubmountOverlay overlays the inode over a ramfs tree containing the given
 // paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
 	// Construct a ramfs tree of mount points. The contents never
 	// change, so this can be fully caching. There's no real
 	// filesystem backing this tree, so we set the filesystem to
@@ -264,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
 	if err != nil {
 		return nil, fmt.Errorf("creating mount tree: %v", err)
 	}
-	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding mount overlay: %v", err)
 	}
@@ -741,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con
 	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
 	// mounted even if they are not in the spec.
 	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding submount overlay: %v", err)
 	}
@@ -851,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi
 	submounts := subtargets(m.Destination, c.mounts)
 	if len(submounts) > 0 {
 		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
 		if err != nil {
 			return fmt.Errorf("adding submount overlay: %v", err)
 		}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index dc9861389..bf9ec5d38 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -491,9 +491,9 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			}
 
 			ctx := l.k.SupervisorContext()
-			mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
+			mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
 			if err != nil {
-				t.Fatalf("failed to setupVFS2: %v", err)
+				t.Fatalf("mountAll: %v", err)
 			}
 
 			root := mns.Root()
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 66b6cf19b..7844ea28c 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -134,7 +134,7 @@ func registerFilesystems(k *kernel.Kernel) error {
 }
 
 func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
-	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	mns, err := mntr.mountAll(conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
 	}
@@ -149,7 +149,7 @@ func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containe
 	return nil
 }
 
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
 	log.Infof("Configuring container's file system with VFS2")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -172,24 +172,44 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, p
 	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
 		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
 	}
+
+	if c.root.Readonly || conf.Overlay {
+		// Switch to ReadOnly after all submounts were setup.
+		root := mns.Root()
+		defer root.DecRef(rootCtx)
+		if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+			return nil, fmt.Errorf(`failed to set mount at "/" readonly: %v`, err)
+		}
+	}
+
 	return mns, nil
 }
 
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+// The mount is created ReadWrite to allow mount point for submounts to be
+// created. ** The caller is responsible to switch to ReadOnly if needed **
 func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+	data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
 
 	if conf.OverlayfsStaleRead {
 		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
 		// can only send mount options for specs.Mounts (specs.Root is missing
 		// Options field). So assume root is always on top of overlayfs.
-		opts = append(opts, "overlayfs_stale_read")
+		data = append(data, "overlayfs_stale_read")
 	}
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
-		Data: strings.Join(opts, ","),
-	})
+	opts := &vfs.MountOptions{
+		// Always mount as ReadWrite to allow other mounts on top of it. It'll be
+		// made ReadOnly in the caller (if needed).
+		ReadOnly: false,
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+		},
+		InternalMount: true,
+	}
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, opts)
 	if err != nil {
 		return nil, fmt.Errorf("setting up mount namespace: %w", err)
 	}
@@ -227,6 +247,7 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.
 			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
 				return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err)
 			}
+			// Restore back to ReadOnly at the end.
 			defer func() {
 				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
 					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 6082068c7..33ada5bb9 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -41,6 +41,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -1490,6 +1491,8 @@ func TestMountNewDir(t *testing.T) {
 				Source:      srcDir,
 				Type:        "bind",
 			})
+			// Extra points for creating the mount with a readonly root.
+			spec.Root.Readonly = true
 
 			if err := run(spec, conf); err != nil {
 				t.Fatalf("error running sandbox: %v", err)
@@ -1499,17 +1502,17 @@ func TestMountNewDir(t *testing.T) {
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
-			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 			spec.Root.Readonly = true
+
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
 			defer cleanup()
 
-			// Create, start and wait for the container.
 			args := Args{
 				ID:        testutil.RandomContainerID(),
 				Spec:      spec,
@@ -1524,12 +1527,82 @@ func TestReadonlyRoot(t *testing.T) {
 				t.Fatalf("error starting container: %v", err)
 			}
 
-			ws, err := c.Wait()
+			// Read mounts to check that root is readonly.
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '")
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("root mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("root not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", "/foo")
 			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
+				t.Fatalf("touch file in ro mount: %v", err)
 			}
 			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
+				t.Fatalf("wrong waitStatus: %v", ws)
+			}
+		})
+	}
+}
+
+func TestReadonlyMount(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "bind",
+				Options:     []string{"ro"},
+			})
+			spec.Root.Readonly = false
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Read mounts to check that volume is readonly.
+			cmd := fmt.Sprintf("mount | grep ' %s '", dir)
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd)
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("volume not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", path.Join(dir, "file"))
+			if err != nil {
+				t.Fatalf("touch file in ro mount: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("wrong WaitStatus: %v", ws)
 			}
 		})
 	}
@@ -1616,54 +1689,6 @@ func TestUIDMap(t *testing.T) {
 	}
 }
 
-func TestReadonlyMount(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
-		t.Run(name, func(t *testing.T) {
-			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
-			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-			if err != nil {
-				t.Fatalf("ioutil.TempDir() failed: %v", err)
-			}
-			spec.Mounts = append(spec.Mounts, specs.Mount{
-				Destination: dir,
-				Source:      dir,
-				Type:        "bind",
-				Options:     []string{"ro"},
-			})
-			spec.Root.Readonly = false
-
-			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
-			if err != nil {
-				t.Fatalf("error setting up container: %v", err)
-			}
-			defer cleanup()
-
-			// Create, start and wait for the container.
-			args := Args{
-				ID:        testutil.RandomContainerID(),
-				Spec:      spec,
-				BundleDir: bundleDir,
-			}
-			c, err := New(conf, args)
-			if err != nil {
-				t.Fatalf("error creating container: %v", err)
-			}
-			defer c.Destroy()
-			if err := c.Start(conf); err != nil {
-				t.Fatalf("error starting container: %v", err)
-			}
-
-			ws, err := c.Wait()
-			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
-			}
-			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
-			}
-		})
-	}
-}
-
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
@@ -2116,21 +2141,13 @@ func TestMountPropagation(t *testing.T) {
 
 	// Check that mount didn't propagate to private mount.
 	privFile := filepath.Join(priv, "mnt", "file")
-	execArgs := &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "!", "-f", privFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 
 	// Check that mount propagated to slave mount.
 	slaveFile := filepath.Join(slave, "mnt", "file")
-	execArgs = &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "-f", slaveFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 }
@@ -2196,11 +2213,7 @@ func TestMountSymlink(t *testing.T) {
 			// Check that symlink was resolved and mount was created where the symlink
 			// is pointing to.
 			file := path.Join(target, "file")
-			execArgs := &control.ExecArgs{
-				Filename: "/usr/bin/test",
-				Argv:     []string{"test", "-f", file},
-			}
-			if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+			if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 {
 				t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
 			}
 		})
@@ -2326,6 +2339,35 @@ func TestTTYField(t *testing.T) {
 	}
 }
 
+func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) {
+	args := &control.ExecArgs{
+		Filename: name,
+		Argv:     append([]string{name}, arg...),
+	}
+	return cont.executeSync(args)
+}
+
+func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) {
+	r, w, err := os.Pipe()
+	if err != nil {
+		return nil, 0, err
+	}
+	defer r.Close()
+
+	args := &control.ExecArgs{
+		Filename:    name,
+		Argv:        append([]string{name}, arg...),
+		FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}},
+	}
+	ws, err := cont.executeSync(args)
+	w.Close()
+	if err != nil {
+		return nil, 0, err
+	}
+	out, err := ioutil.ReadAll(r)
+	return out, ws, err
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 5b790c6c8..952215ec1 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -1517,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	}
 
 	// Check that container isn't running anymore.
-	args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-	if _, err := c.executeSync(args); err == nil {
+	if _, err := execute(c, "/bin/true"); err == nil {
 		t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 	}
 
@@ -1533,8 +1532,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
 		}
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err != nil {
+		if _, err := execute(c, "/bin/true"); err != nil {
 			t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
 		}
 	}
@@ -1556,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 
 	// Check that entire sandbox isn't running anymore.
 	for _, c := range containers {
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err == nil {
+		if _, err := execute(c, "/bin/true"); err == nil {
 			t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 		}
 	}
@@ -1719,12 +1716,11 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 				homeDirs[name] = homeFile
 			}
 
-			// We will sleep in the root container in order to ensure that
-			// the root container doesn't terminate before sub containers can be
-			// created.
+			// We will sleep in the root container in order to ensure that the root
+			//container doesn't terminate before sub containers can be created.
 			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
 			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
-			execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())}
+			execCmd := fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())
 
 			// Setup the containers, a root container and sub container.
 			specConfig, ids := createSpecs(rootCmd, subCmd)
@@ -1735,9 +1731,8 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 			defer cleanup()
 
 			// Exec into the root container synchronously.
-			args := &control.ExecArgs{Argv: execCmd}
-			if _, err := containers[0].executeSync(args); err != nil {
-				t.Errorf("error executing %+v: %v", args, err)
+			if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil {
+				t.Errorf("error executing %+v: %v", execCmd, err)
 			}
 
 			// Wait for the subcontainer to finish.
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index 4ea8fefee..cb5bffb89 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) {
 
 func checkFile(c *Container, filename string, want []byte) error {
 	cpy := filename + ".copy"
-	argsCp := &control.ExecArgs{
-		Filename: "/bin/cp",
-		Argv:     []string{"cp", "-f", filename, cpy},
-	}
-	if _, err := c.executeSync(argsCp); err != nil {
+	if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil {
 		return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err)
 	}
 	got, err := ioutil.ReadFile(cpy)
@@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) {
 	}
 
 	// Append to file inside the container and check that content is not lost.
-	argsAppend := &control.ExecArgs{
-		Filename: "/bin/bash",
-		Argv:     []string{"bash", "-c", "echo -n sandbox- >> " + filename},
-	}
-	if _, err := c.executeSync(argsAppend); err != nil {
+	if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil {
 		t.Fatalf("unexpected error appending file %q: %v", filename, err)
 	}
 	want = []byte("host-sandbox-")
-- 
cgit v1.2.3


From a530de2b29c5f5fa0236ce95f7bc788effab446a Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 8 Sep 2020 14:40:57 -0700
Subject: [vfs] overlayfs: decref VD when not using it.

overlay/filesystem.go:lookupLocked() did not DecRef the VD on some error paths
when it would not end up saving or using the VD.

PiperOrigin-RevId: 330589742
---
 pkg/sentry/fsimpl/overlay/filesystem.go | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 87afeeaf3..b530851c5 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -211,6 +211,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 			lookupErr = err
 			return false
 		}
+		defer childVD.DecRef(ctx)
 
 		mask := uint32(linux.STATX_TYPE)
 		if !existsOnAnyLayer {
@@ -249,6 +250,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 		}
 
 		// Update child to include this layer.
+		childVD.IncRef()
 		if isUpper {
 			child.upperVD = childVD
 			child.copiedUp = 1
-- 
cgit v1.2.3


From 0170be90f67b4dc075710d1cd82ccd8d3ac9156a Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 8 Sep 2020 15:52:01 -0700
Subject: Implement ioctl with enable verity

ioctl with FS_IOC_ENABLE_VERITY is added to verity file system to enable
a file as verity file. For a file, a Merkle tree is built with its data.
For a directory, a Merkle tree is built with the root hashes of its
children.

PiperOrigin-RevId: 330604368
---
 pkg/abi/linux/ioctl.go             |   5 ++
 pkg/sentry/fsimpl/verity/BUILD     |   2 +
 pkg/sentry/fsimpl/verity/verity.go | 128 +++++++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+)

(limited to 'pkg')

diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index d6dbedc3e..a4fe7501d 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -113,6 +113,11 @@ const (
 	_IOC_DIRSHIFT  = _IOC_SIZESHIFT + _IOC_SIZEBITS
 )
 
+// Constants from uapi/linux/fsverity.h.
+const (
+	FS_IOC_ENABLE_VERITY = 1082156677
+)
+
 // IOC outputs the result of _IOC macro in asm-generic/ioctl.h.
 func IOC(dir, typ, nr, size uint32) uint32 {
 	return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 326c4ed90..d28450e53 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -14,11 +14,13 @@ go_library(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/merkletree",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index eedb5f484..0bac8e938 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -22,16 +22,21 @@
 package verity
 
 import (
+	"strconv"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+
+	"gvisor.dev/gvisor/pkg/merkletree"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Name is the default filesystem name.
@@ -471,6 +476,129 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return syserror.EPERM
 }
 
+// generateMerkle generates a Merkle tree file for fd. If fd points to a file
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The root
+// hash of the generated Merkle tree and the data size is returned.
+// If fd points to a regular file, the data is the content of the file. If fd
+// points to a directory, the data is all root hahes of its children, written
+// to the Merkle tree file.
+func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  fd.lowerFD,
+		Ctx: ctx,
+	}
+	merkleReader := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleReader,
+		Ctx: ctx,
+	}
+	merkleWriter := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleWriter,
+		Ctx: ctx,
+	}
+	var rootHash []byte
+	var dataSize uint64
+
+	switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+	case linux.S_IFREG:
+		// For a regular file, generate a Merkle tree based on its
+		// content.
+		var err error
+		stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+		dataSize = stat.Size
+
+		rootHash, err = merkletree.Generate(&fdReader, int64(dataSize), &merkleReader, &merkleWriter, false /* dataAndTreeInSameFile */)
+		if err != nil {
+			return nil, 0, err
+		}
+	case linux.S_IFDIR:
+		// For a directory, generate a Merkle tree based on the root
+		// hashes of its children that has already been written to the
+		// Merkle tree file.
+		merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+		dataSize = merkleStat.Size
+
+		rootHash, err = merkletree.Generate(&merkleReader, int64(dataSize), &merkleReader, &merkleWriter, true /* dataAndTreeInSameFile */)
+		if err != nil {
+			return nil, 0, err
+		}
+	default:
+		// TODO(b/167728857): Investigate whether and how we should
+		// enable other types of file.
+		return nil, 0, syserror.EINVAL
+	}
+	return rootHash, dataSize, nil
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and stores its root hash in its parent directory's Merkle tree.
+func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	if !fd.d.fs.allowRuntimeEnable {
+		return 0, syserror.EPERM
+	}
+
+	// Lock to prevent other threads performing enable or access the file
+	// while it's being enabled.
+	verityMu.Lock()
+	defer verityMu.Unlock()
+
+	if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || fd.parentMerkleWriter == nil {
+		panic("Unexpected verity fd: missing expected underlying fds")
+	}
+
+	rootHash, dataSize, err := fd.generateMerkle(ctx)
+	if err != nil {
+		return 0, err
+	}
+
+	stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		return 0, err
+	}
+
+	// Write the root hash of fd to the parent directory's Merkle tree
+	// file, as it should be part of the parent Merkle tree data.
+	// parentMerkleWriter is open with O_APPEND, so it should write
+	// directly to the end of the file.
+	if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(rootHash), vfs.WriteOptions{}); err != nil {
+		return 0, err
+	}
+
+	// Record the offset of the root hash of fd in parent directory's
+	// Merkle tree file.
+	if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+		Name:  merkleOffsetInParentXattr,
+		Value: strconv.Itoa(int(stat.Size)),
+	}); err != nil {
+		return 0, err
+	}
+
+	// Record the size of the data being hashed for fd.
+	if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+		Name:  merkleSizeXattr,
+		Value: strconv.Itoa(int(dataSize)),
+	}); err != nil {
+		return 0, err
+	}
+	fd.d.rootHash = append(fd.d.rootHash, rootHash...)
+	return 0, nil
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FS_IOC_ENABLE_VERITY:
+		return fd.enableVerity(ctx, uio, args)
+	default:
+		return fd.lowerFD.Ioctl(ctx, uio, args)
+	}
+}
+
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
 	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
-- 
cgit v1.2.3


From 6b2ba821c791ab1713cada80d9b722496224f663 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 8 Sep 2020 15:54:22 -0700
Subject: Add check for both child and childMerkle ENOENT

The check in verity walk returns error for non ENOENT cases, and all
ENOENT results should be checked. This case was missing.

PiperOrigin-RevId: 330604771
---
 pkg/sentry/fsimpl/verity/filesystem.go | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index e944fd5d2..2cf0a38c9 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -414,6 +414,14 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
 			}
 			panic(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
 		}
+	} else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT {
+		// Both the child and the corresponding Merkle tree are missing.
+		// This could be an unexpected modification or due to incorrect
+		// parameter.
+		// TODO(b/167752508): Investigate possible ways to differentiate
+		// cases that both files are deleted from cases that they never
+		// exist in the file system.
+		panic(fmt.Sprintf("Failed to find file %s", parentPath+"/"+name))
 	}
 
 	mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
-- 
cgit v1.2.3


From 397dc3ae5d2d4f836a7820a618c432ba49e4f4a7 Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 8 Sep 2020 17:54:13 -0700
Subject: [vfs] overlayfs: Fix socket tests.

- BindSocketThenOpen test was expecting the incorrect error when opening
  a socket. Fixed that.
- VirtualFilesystem.BindEndpointAt should not require pop.Path.Begin.Ok()
  because the filesystem implementations do not need to walk to the parent
  dentry. This check also exists for MknodAt, MkdirAt, RmdirAt, SymlinkAt and
  UnlinkAt but those filesystem implementations also need to walk to the parent
  denty. So that check is valid. Added some syscall tests to test this.

PiperOrigin-RevId: 330625220
---
 pkg/sentry/vfs/vfs.go          | 16 ++++++++++------
 test/syscalls/linux/mkdir.cc   |  7 +++++++
 test/syscalls/linux/mknod.cc   |  8 ++++++++
 test/syscalls/linux/symlink.cc | 10 ++++++++++
 test/syscalls/linux/unlink.cc  | 14 ++++++++++++++
 5 files changed, 49 insertions(+), 6 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 6825d81a5..ed1cf99ba 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -297,6 +297,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 // MkdirAt creates a directory at the given path.
 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with mkdirat(dirfd, "", mode).
 		if pop.Path.Absolute {
 			return syserror.EEXIST
 		}
@@ -333,6 +335,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 // error from the syserror package.
 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with mknodat(dirfd, "", mode, dev).
 		if pop.Path.Absolute {
 			return syserror.EEXIST
 		}
@@ -518,6 +522,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
 // RmdirAt removes the directory at the given path.
 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR).
 		if pop.Path.Absolute {
 			return syserror.EBUSY
 		}
@@ -599,6 +605,8 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
 // SymlinkAt creates a symbolic link at the given path with the given target.
 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with symlinkat(oldpath, newdirfd, "").
 		if pop.Path.Absolute {
 			return syserror.EEXIST
 		}
@@ -631,6 +639,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
 // UnlinkAt deletes the non-directory file at the given path.
 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with unlinkat(dirfd, "", 0).
 		if pop.Path.Absolute {
 			return syserror.EBUSY
 		}
@@ -662,12 +672,6 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 
 // BoundEndpointAt gets the bound endpoint at the given path, if one exists.
 func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
-	if !pop.Path.Begin.Ok() {
-		if pop.Path.Absolute {
-			return nil, syserror.ECONNREFUSED
-		}
-		return nil, syserror.ENOENT
-	}
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index 4036a9275..27758203d 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -82,6 +82,13 @@ TEST_F(MkdirTest, FailsOnDirWithoutWritePerms) {
               SyscallFailsWithErrno(EACCES));
 }
 
+TEST_F(MkdirTest, MkdirAtEmptyPath) {
+  ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dirname_, O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(mkdirat(fd.get(), "", 0777), SyscallFailsWithErrno(ENOENT));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc
index 2ba8c11b8..89e4564e8 100644
--- a/test/syscalls/linux/mknod.cc
+++ b/test/syscalls/linux/mknod.cc
@@ -203,6 +203,14 @@ TEST(MknodTest, FifoTruncNoOp) {
   EXPECT_THAT(ftruncate(wfd.get(), 0), SyscallFailsWithErrno(EINVAL));
 }
 
+TEST(MknodTest, MknodAtEmptyPath) {
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(mknodat(fd.get(), "", S_IFREG | 0777, 0),
+              SyscallFailsWithErrno(ENOENT));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index aa1f32c85..a5d7efe94 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -326,6 +326,16 @@ TEST(SymlinkTest, FollowUpdatesATime) {
   EXPECT_LT(st_before_follow.st_atime, st_after_follow.st_atime);
 }
 
+TEST(SymlinkTest, SymlinkAtEmptyPath) {
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(symlinkat(file.path().c_str(), fd.get(), ""),
+              SyscallFailsWithErrno(ENOENT));
+}
+
 class ParamSymlinkTest : public ::testing::TestWithParam<std::string> {};
 
 // Test that creating an existing symlink with creat will create the target.
diff --git a/test/syscalls/linux/unlink.cc b/test/syscalls/linux/unlink.cc
index 2040375c9..061e2e0f1 100644
--- a/test/syscalls/linux/unlink.cc
+++ b/test/syscalls/linux/unlink.cc
@@ -208,6 +208,20 @@ TEST(RmdirTest, CanRemoveWithTrailingSlashes) {
   ASSERT_THAT(rmdir(slashslash.c_str()), SyscallSucceeds());
 }
 
+TEST(UnlinkTest, UnlinkAtEmptyPath) {
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+  EXPECT_THAT(unlinkat(fd.get(), "", 0), SyscallFailsWithErrno(ENOENT));
+
+  auto dirInDir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+  auto dirFD = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(dirInDir.path(), O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(unlinkat(dirFD.get(), "", AT_REMOVEDIR),
+              SyscallFailsWithErrno(ENOENT));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From f9fa8b118f0f1e4a9fbe7fa5f5a367ba8105ddf0 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 8 Sep 2020 18:31:17 -0700
Subject: Implement synthetic mountpoints for kernfs.

PiperOrigin-RevId: 330629897
---
 pkg/sentry/fsimpl/kernfs/BUILD                  |   1 +
 pkg/sentry/fsimpl/kernfs/filesystem.go          |   5 +-
 pkg/sentry/fsimpl/kernfs/synthetic_directory.go | 102 ++++++++++++++++++++++++
 3 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 pkg/sentry/fsimpl/kernfs/synthetic_directory.go

(limited to 'pkg')

diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 637dca70c..5e91e0536 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -83,6 +83,7 @@ go_library(
         "slot_list.go",
         "static_directory_refs.go",
         "symlink.go",
+        "synthetic_directory.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index c428053e8..d7d3e8f48 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -360,7 +360,10 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 	defer rp.Mount().EndWrite()
 	childVFSD, err := parentInode.NewDir(ctx, pc, opts)
 	if err != nil {
-		return err
+		if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+			return err
+		}
+		childVFSD = newSyntheticDirectory(rp.Credentials(), opts.Mode)
 	}
 	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
 	return nil
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
new file mode 100644
index 000000000..01ba72fa8
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -0,0 +1,102 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// syntheticDirectory implements kernfs.Inode for a directory created by
+// MkdirAt(ForSyntheticMountpoint=true).
+//
+// +stateify savable
+type syntheticDirectory struct {
+	InodeAttrs
+	InodeNoStatFS
+	InodeNoopRefCount
+	InodeNoDynamicLookup
+	InodeNotSymlink
+	OrderedChildren
+
+	locks vfs.FileLocks
+}
+
+var _ Inode = (*syntheticDirectory)(nil)
+
+func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *vfs.Dentry {
+	inode := &syntheticDirectory{}
+	inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
+	d := &Dentry{}
+	d.Init(inode)
+	return &d.vfsd
+}
+
+func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm))
+	}
+	dir.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.S_IFDIR|perm)
+	dir.OrderedChildren.Init(OrderedChildrenOptions{
+		Writable: true,
+	})
+}
+
+// Open implements Inode.Open.
+func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{})
+	if err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// NewFile implements Inode.NewFile.
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+	if !opts.ForSyntheticMountpoint {
+		return nil, syserror.EPERM
+	}
+	subdird := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
+	if err := dir.OrderedChildren.Insert(name, subdird); err != nil {
+		subdird.DecRef(ctx)
+		return nil, err
+	}
+	return subdird, nil
+}
+
+// NewLink implements Inode.NewLink.
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
-- 
cgit v1.2.3


From 031dd3fc2127e65c4187666999c348d3965a1d38 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 9 Sep 2020 12:47:24 -0700
Subject: Don't sched_setaffinity in ptrace platform.

PiperOrigin-RevId: 330777900
---
 pkg/sentry/platform/ptrace/BUILD                   |  1 -
 pkg/sentry/platform/ptrace/filters.go              |  9 ++--
 pkg/sentry/platform/ptrace/subprocess.go           |  5 --
 .../platform/ptrace/subprocess_linux_unsafe.go     | 61 ----------------------
 4 files changed, 4 insertions(+), 72 deletions(-)

(limited to 'pkg')

diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index e04165fbf..fc43cc3c0 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -30,7 +30,6 @@ go_library(
         "//pkg/safecopy",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
-        "//pkg/sentry/hostcpu",
         "//pkg/sentry/memmap",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/interrupt",
diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go
index 1e07cfd0d..b0970e356 100644
--- a/pkg/sentry/platform/ptrace/filters.go
+++ b/pkg/sentry/platform/ptrace/filters.go
@@ -24,10 +24,9 @@ import (
 // SyscallFilters returns syscalls made exclusively by the ptrace platform.
 func (*PTrace) SyscallFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
-		unix.SYS_GETCPU:            {},
-		unix.SYS_SCHED_SETAFFINITY: {},
-		syscall.SYS_PTRACE:         {},
-		syscall.SYS_TGKILL:         {},
-		syscall.SYS_WAIT4:          {},
+		unix.SYS_GETCPU:    {},
+		syscall.SYS_PTRACE: {},
+		syscall.SYS_TGKILL: {},
+		syscall.SYS_WAIT4:  {},
 	}
 }
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index e1d54d8a2..812ab80ef 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -518,11 +518,6 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	}
 	defer c.interrupt.Disable()
 
-	// Ensure that the CPU set is bound appropriately; this makes the
-	// emulation below several times faster, presumably by avoiding
-	// interprocessor wakeups and by simplifying the schedule.
-	t.bind()
-
 	// Set registers.
 	if err := t.setRegs(regs); err != nil {
 		panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err))
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index 245b20722..533e45497 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -18,29 +18,12 @@
 package ptrace
 
 import (
-	"sync/atomic"
 	"syscall"
 	"unsafe"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
-	"gvisor.dev/gvisor/pkg/sync"
 )
 
-// maskPool contains reusable CPU masks for setting affinity. Unfortunately,
-// runtime.NumCPU doesn't actually record the number of CPUs on the system, it
-// just records the number of CPUs available in the scheduler affinity set at
-// startup. This may a) change over time and b) gives a number far lower than
-// the maximum indexable CPU. To prevent lots of allocation in the hot path, we
-// use a pool to store large masks that we can reuse during bind.
-var maskPool = sync.Pool{
-	New: func() interface{} {
-		const maxCPUs = 1024 // Not a hard limit; see below.
-		return make([]uintptr, maxCPUs/64)
-	},
-}
-
 // unmaskAllSignals unmasks all signals on the current thread.
 //
 //go:nosplit
@@ -49,47 +32,3 @@ func unmaskAllSignals() syscall.Errno {
 	_, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0)
 	return errno
 }
-
-// setCPU sets the CPU affinity.
-func (t *thread) setCPU(cpu uint32) error {
-	mask := maskPool.Get().([]uintptr)
-	n := int(cpu / 64)
-	v := uintptr(1 << uintptr(cpu%64))
-	if n >= len(mask) {
-		// See maskPool note above. We've actually exceeded the number
-		// of available cores. Grow the mask and return it.
-		mask = make([]uintptr, n+1)
-	}
-	mask[n] |= v
-	if _, _, errno := syscall.RawSyscall(
-		unix.SYS_SCHED_SETAFFINITY,
-		uintptr(t.tid),
-		uintptr(len(mask)*8),
-		uintptr(unsafe.Pointer(&mask[0]))); errno != 0 {
-		return errno
-	}
-	mask[n] &^= v
-	maskPool.Put(mask)
-	return nil
-}
-
-// bind attempts to ensure that the thread is on the same CPU as the current
-// thread. This provides no guarantees as it is fundamentally a racy operation:
-// CPU sets may change and we may be rescheduled in the middle of this
-// operation. As a result, no failures are reported.
-//
-// Precondition: the current runtime thread should be locked.
-func (t *thread) bind() {
-	currentCPU := hostcpu.GetCPU()
-
-	if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU {
-		// Set the affinity on the thread and save the CPU for next
-		// round; we don't expect CPUs to bounce around too frequently.
-		//
-		// (It's worth noting that we could move CPUs between this point
-		// and when the tracee finishes executing. But that would be
-		// roughly the status quo anyways -- we're just maximizing our
-		// chances of colocation, not guaranteeing it.)
-		t.setCPU(currentCPU)
-	}
-}
-- 
cgit v1.2.3