Merge release-20210112.0-25-ge57ebcd37 (automated)

author: gVisor bot <gvisor-bot@google.com> 2021-01-15 15:47:01 +0000
committer: gVisor bot <gvisor-bot@google.com> 2021-01-15 15:47:01 +0000
commit: 578c5460b62f52063bef41203940a315deced6b3 (patch)
tree: 25ee54afd2fba0133c549110656d9efc631031c9
parent: 6cc587a931cb704006e5d843f725b4be2d1523c9 (diff)
parent: e57ebcd37a7b9f98d80e594f2c0baf2220d7b830 (diff)
23 files changed, 408 insertions, 1488 deletions
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
deleted file mode 100644
index 311808ae9..000000000
--- a/pkg/buffer/buffer.go
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package buffer provides the implementation of a buffer view.
-//
-// A view is an flexible buffer, supporting the safecopy operations natively as
-// well as the ability to grow via either prepend or append, as well as shrink.
-package buffer
-
-// buffer encapsulates a queueable byte buffer.
-//
-// +stateify savable
-type buffer struct {
-	data  []byte
-	read  int
-	write int
-	bufferEntry
-}
-
-// init performs in-place initialization for zero value.
-func (b *buffer) init(size int) {
-	b.data = make([]byte, size)
-}
-
-// Reset resets read and write locations, effectively emptying the buffer.
-func (b *buffer) Reset() {
-	b.read = 0
-	b.write = 0
-}
-
-// Full indicates the buffer is full.
-//
-// This indicates there is no capacity left to write.
-func (b *buffer) Full() bool {
-	return b.write == len(b.data)
-}
-
-// ReadSize returns the number of bytes available for reading.
-func (b *buffer) ReadSize() int {
-	return b.write - b.read
-}
-
-// ReadMove advances the read index by the given amount.
-func (b *buffer) ReadMove(n int) {
-	b.read += n
-}
-
-// ReadSlice returns the read slice for this buffer.
-func (b *buffer) ReadSlice() []byte {
-	return b.data[b.read:b.write]
-}
-
-// WriteSize returns the number of bytes available for writing.
-func (b *buffer) WriteSize() int {
-	return len(b.data) - b.write
-}
-
-// WriteMove advances the write index by the given amount.
-func (b *buffer) WriteMove(n int) {
-	b.write += n
-}
-
-// WriteSlice returns the write slice for this buffer.
-func (b *buffer) WriteSlice() []byte {
-	return b.data[b.write:]
-}
diff --git a/pkg/buffer/buffer_list.go b/pkg/buffer/buffer_list.go
deleted file mode 100644
index 6b5bea3fc..000000000
--- a/pkg/buffer/buffer_list.go
+++ /dev/null
@@ -1,221 +0,0 @@
-package buffer
-
-// ElementMapper provides an identity mapping by default.
-//
-// This can be replaced to provide a struct that maps elements to linker
-// objects, if they are not the same. An ElementMapper is not typically
-// required if: Linker is left as is, Element is left as is, or Linker and
-// Element are the same type.
-type bufferElementMapper struct{}
-
-// linkerFor maps an Element to a Linker.
-//
-// This default implementation should be inlined.
-//
-//go:nosplit
-func (bufferElementMapper) linkerFor(elem *buffer) *buffer { return elem }
-
-// List is an intrusive list. Entries can be added to or removed from the list
-// in O(1) time and with no additional memory allocations.
-//
-// The zero value for List is an empty list ready to use.
-//
-// To iterate over a list (where l is a List):
-//      for e := l.Front(); e != nil; e = e.Next() {
-// 		// do something with e.
-//      }
-//
-// +stateify savable
-type bufferList struct {
-	head *buffer
-	tail *buffer
-}
-
-// Reset resets list l to the empty state.
-func (l *bufferList) Reset() {
-	l.head = nil
-	l.tail = nil
-}
-
-// Empty returns true iff the list is empty.
-//
-//go:nosplit
-func (l *bufferList) Empty() bool {
-	return l.head == nil
-}
-
-// Front returns the first element of list l or nil.
-//
-//go:nosplit
-func (l *bufferList) Front() *buffer {
-	return l.head
-}
-
-// Back returns the last element of list l or nil.
-//
-//go:nosplit
-func (l *bufferList) Back() *buffer {
-	return l.tail
-}
-
-// Len returns the number of elements in the list.
-//
-// NOTE: This is an O(n) operation.
-//
-//go:nosplit
-func (l *bufferList) Len() (count int) {
-	for e := l.Front(); e != nil; e = (bufferElementMapper{}.linkerFor(e)).Next() {
-		count++
-	}
-	return count
-}
-
-// PushFront inserts the element e at the front of list l.
-//
-//go:nosplit
-func (l *bufferList) PushFront(e *buffer) {
-	linker := bufferElementMapper{}.linkerFor(e)
-	linker.SetNext(l.head)
-	linker.SetPrev(nil)
-	if l.head != nil {
-		bufferElementMapper{}.linkerFor(l.head).SetPrev(e)
-	} else {
-		l.tail = e
-	}
-
-	l.head = e
-}
-
-// PushBack inserts the element e at the back of list l.
-//
-//go:nosplit
-func (l *bufferList) PushBack(e *buffer) {
-	linker := bufferElementMapper{}.linkerFor(e)
-	linker.SetNext(nil)
-	linker.SetPrev(l.tail)
-	if l.tail != nil {
-		bufferElementMapper{}.linkerFor(l.tail).SetNext(e)
-	} else {
-		l.head = e
-	}
-
-	l.tail = e
-}
-
-// PushBackList inserts list m at the end of list l, emptying m.
-//
-//go:nosplit
-func (l *bufferList) PushBackList(m *bufferList) {
-	if l.head == nil {
-		l.head = m.head
-		l.tail = m.tail
-	} else if m.head != nil {
-		bufferElementMapper{}.linkerFor(l.tail).SetNext(m.head)
-		bufferElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
-
-		l.tail = m.tail
-	}
-	m.head = nil
-	m.tail = nil
-}
-
-// InsertAfter inserts e after b.
-//
-//go:nosplit
-func (l *bufferList) InsertAfter(b, e *buffer) {
-	bLinker := bufferElementMapper{}.linkerFor(b)
-	eLinker := bufferElementMapper{}.linkerFor(e)
-
-	a := bLinker.Next()
-
-	eLinker.SetNext(a)
-	eLinker.SetPrev(b)
-	bLinker.SetNext(e)
-
-	if a != nil {
-		bufferElementMapper{}.linkerFor(a).SetPrev(e)
-	} else {
-		l.tail = e
-	}
-}
-
-// InsertBefore inserts e before a.
-//
-//go:nosplit
-func (l *bufferList) InsertBefore(a, e *buffer) {
-	aLinker := bufferElementMapper{}.linkerFor(a)
-	eLinker := bufferElementMapper{}.linkerFor(e)
-
-	b := aLinker.Prev()
-	eLinker.SetNext(a)
-	eLinker.SetPrev(b)
-	aLinker.SetPrev(e)
-
-	if b != nil {
-		bufferElementMapper{}.linkerFor(b).SetNext(e)
-	} else {
-		l.head = e
-	}
-}
-
-// Remove removes e from l.
-//
-//go:nosplit
-func (l *bufferList) Remove(e *buffer) {
-	linker := bufferElementMapper{}.linkerFor(e)
-	prev := linker.Prev()
-	next := linker.Next()
-
-	if prev != nil {
-		bufferElementMapper{}.linkerFor(prev).SetNext(next)
-	} else if l.head == e {
-		l.head = next
-	}
-
-	if next != nil {
-		bufferElementMapper{}.linkerFor(next).SetPrev(prev)
-	} else if l.tail == e {
-		l.tail = prev
-	}
-
-	linker.SetNext(nil)
-	linker.SetPrev(nil)
-}
-
-// Entry is a default implementation of Linker. Users can add anonymous fields
-// of this type to their structs to make them automatically implement the
-// methods needed by List.
-//
-// +stateify savable
-type bufferEntry struct {
-	next *buffer
-	prev *buffer
-}
-
-// Next returns the entry that follows e in the list.
-//
-//go:nosplit
-func (e *bufferEntry) Next() *buffer {
-	return e.next
-}
-
-// Prev returns the entry that precedes e in the list.
-//
-//go:nosplit
-func (e *bufferEntry) Prev() *buffer {
-	return e.prev
-}
-
-// SetNext assigns 'entry' as the entry that follows e in the list.
-//
-//go:nosplit
-func (e *bufferEntry) SetNext(elem *buffer) {
-	e.next = elem
-}
-
-// SetPrev assigns 'entry' as the entry that precedes e in the list.
-//
-//go:nosplit
-func (e *bufferEntry) SetPrev(elem *buffer) {
-	e.prev = elem
-}
diff --git a/pkg/buffer/buffer_state_autogen.go b/pkg/buffer/buffer_state_autogen.go
deleted file mode 100644
index 29007f642..000000000
--- a/pkg/buffer/buffer_state_autogen.go
+++ /dev/null
@@ -1,153 +0,0 @@
-// automatically generated by stateify.
-
-package buffer
-
-import (
-	"gvisor.dev/gvisor/pkg/state"
-)
-
-func (b *buffer) StateTypeName() string {
-	return "pkg/buffer.buffer"
-}
-
-func (b *buffer) StateFields() []string {
-	return []string{
-		"data",
-		"read",
-		"write",
-		"bufferEntry",
-	}
-}
-
-func (b *buffer) beforeSave() {}
-
-func (b *buffer) StateSave(stateSinkObject state.Sink) {
-	b.beforeSave()
-	stateSinkObject.Save(0, &b.data)
-	stateSinkObject.Save(1, &b.read)
-	stateSinkObject.Save(2, &b.write)
-	stateSinkObject.Save(3, &b.bufferEntry)
-}
-
-func (b *buffer) afterLoad() {}
-
-func (b *buffer) StateLoad(stateSourceObject state.Source) {
-	stateSourceObject.Load(0, &b.data)
-	stateSourceObject.Load(1, &b.read)
-	stateSourceObject.Load(2, &b.write)
-	stateSourceObject.Load(3, &b.bufferEntry)
-}
-
-func (l *bufferList) StateTypeName() string {
-	return "pkg/buffer.bufferList"
-}
-
-func (l *bufferList) StateFields() []string {
-	return []string{
-		"head",
-		"tail",
-	}
-}
-
-func (l *bufferList) beforeSave() {}
-
-func (l *bufferList) StateSave(stateSinkObject state.Sink) {
-	l.beforeSave()
-	stateSinkObject.Save(0, &l.head)
-	stateSinkObject.Save(1, &l.tail)
-}
-
-func (l *bufferList) afterLoad() {}
-
-func (l *bufferList) StateLoad(stateSourceObject state.Source) {
-	stateSourceObject.Load(0, &l.head)
-	stateSourceObject.Load(1, &l.tail)
-}
-
-func (e *bufferEntry) StateTypeName() string {
-	return "pkg/buffer.bufferEntry"
-}
-
-func (e *bufferEntry) StateFields() []string {
-	return []string{
-		"next",
-		"prev",
-	}
-}
-
-func (e *bufferEntry) beforeSave() {}
-
-func (e *bufferEntry) StateSave(stateSinkObject state.Sink) {
-	e.beforeSave()
-	stateSinkObject.Save(0, &e.next)
-	stateSinkObject.Save(1, &e.prev)
-}
-
-func (e *bufferEntry) afterLoad() {}
-
-func (e *bufferEntry) StateLoad(stateSourceObject state.Source) {
-	stateSourceObject.Load(0, &e.next)
-	stateSourceObject.Load(1, &e.prev)
-}
-
-func (p *pool) StateTypeName() string {
-	return "pkg/buffer.pool"
-}
-
-func (p *pool) StateFields() []string {
-	return []string{
-		"bufferSize",
-		"embeddedStorage",
-	}
-}
-
-func (p *pool) beforeSave() {}
-
-func (p *pool) StateSave(stateSinkObject state.Sink) {
-	p.beforeSave()
-	stateSinkObject.Save(0, &p.bufferSize)
-	stateSinkObject.Save(1, &p.embeddedStorage)
-}
-
-func (p *pool) StateLoad(stateSourceObject state.Source) {
-	stateSourceObject.Load(0, &p.bufferSize)
-	stateSourceObject.LoadWait(1, &p.embeddedStorage)
-	stateSourceObject.AfterLoad(p.afterLoad)
-}
-
-func (v *View) StateTypeName() string {
-	return "pkg/buffer.View"
-}
-
-func (v *View) StateFields() []string {
-	return []string{
-		"data",
-		"size",
-		"pool",
-	}
-}
-
-func (v *View) beforeSave() {}
-
-func (v *View) StateSave(stateSinkObject state.Sink) {
-	v.beforeSave()
-	stateSinkObject.Save(0, &v.data)
-	stateSinkObject.Save(1, &v.size)
-	stateSinkObject.Save(2, &v.pool)
-}
-
-func (v *View) afterLoad() {}
-
-func (v *View) StateLoad(stateSourceObject state.Source) {
-	stateSourceObject.Load(0, &v.data)
-	stateSourceObject.Load(1, &v.size)
-	stateSourceObject.Load(2, &v.pool)
-}
-
-func init() {
-	state.Register((*buffer)(nil))
-	state.Register((*bufferList)(nil))
-	state.Register((*bufferEntry)(nil))
-	state.Register((*pool)(nil))
-	state.Register((*View)(nil))
-}
diff --git a/pkg/buffer/buffer_unsafe_state_autogen.go b/pkg/buffer/buffer_unsafe_state_autogen.go
deleted file mode 100644
index 5a5c40722..000000000
--- a/pkg/buffer/buffer_unsafe_state_autogen.go
+++ /dev/null
@@ -1,3 +0,0 @@
-// automatically generated by stateify.
-
-package buffer
diff --git a/pkg/buffer/pool.go b/pkg/buffer/pool.go
deleted file mode 100644
index 7ad6132ab..000000000
--- a/pkg/buffer/pool.go
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package buffer
-
-const (
-	// embeddedCount is the number of buffer structures embedded in the pool. It
-	// is also the number for overflow allocations.
-	embeddedCount = 8
-
-	// defaultBufferSize is the default size for each underlying storage buffer.
-	//
-	// It is slightly less than two pages. This is done intentionally to ensure
-	// that the buffer object aligns with runtime internals. This two page size
-	// will effectively minimize internal fragmentation, but still have a large
-	// enough chunk to limit excessive segmentation.
-	defaultBufferSize = 8144
-)
-
-// pool allocates buffer.
-//
-// It contains an embedded buffer storage for fast path when the number of
-// buffers needed is small.
-//
-// +stateify savable
-type pool struct {
-	bufferSize      int
-	avail           []buffer              `state:"nosave"`
-	embeddedStorage [embeddedCount]buffer `state:"wait"`
-}
-
-// get gets a new buffer from p.
-func (p *pool) get() *buffer {
-	if p.avail == nil {
-		p.avail = p.embeddedStorage[:]
-	}
-	if len(p.avail) == 0 {
-		p.avail = make([]buffer, embeddedCount)
-	}
-	if p.bufferSize <= 0 {
-		p.bufferSize = defaultBufferSize
-	}
-	buf := &p.avail[0]
-	buf.init(p.bufferSize)
-	p.avail = p.avail[1:]
-	return buf
-}
-
-// put releases buf.
-func (p *pool) put(buf *buffer) {
-	// Remove reference to the underlying storage, allowing it to be garbage
-	// collected.
-	buf.data = nil
-}
-
-// setBufferSize sets the size of underlying storage buffer for future
-// allocations. It can be called at any time.
-func (p *pool) setBufferSize(size int) {
-	p.bufferSize = size
-}
-
-// afterLoad is invoked by stateify.
-func (p *pool) afterLoad() {
-	// S/R does not save subslice into embeddedStorage correctly. Restore
-	// available portion of embeddedStorage manually. Restore as nil if none used.
-	for i := len(p.embeddedStorage); i > 0; i-- {
-		if p.embeddedStorage[i-1].data != nil {
-			p.avail = p.embeddedStorage[i:]
-			break
-		}
-	}
-}
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
deleted file mode 100644
index 8b42575b4..000000000
--- a/pkg/buffer/safemem.go
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package buffer
-
-import (
-	"gvisor.dev/gvisor/pkg/safemem"
-)
-
-// WriteBlock returns this buffer as a write Block.
-func (b *buffer) WriteBlock() safemem.Block {
-	return safemem.BlockFromSafeSlice(b.WriteSlice())
-}
-
-// ReadBlock returns this buffer as a read Block.
-func (b *buffer) ReadBlock() safemem.Block {
-	return safemem.BlockFromSafeSlice(b.ReadSlice())
-}
-
-// WriteFromSafememReader writes up to count bytes from r to v and advances the
-// write index by the number of bytes written. It calls r.ReadToBlocks() at
-// most once.
-func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) {
-	if count == 0 {
-		return 0, nil
-	}
-
-	var (
-		dst    safemem.BlockSeq
-		blocks []safemem.Block
-	)
-
-	// Need at least one buffer.
-	firstBuf := v.data.Back()
-	if firstBuf == nil {
-		firstBuf = v.pool.get()
-		v.data.PushBack(firstBuf)
-	}
-
-	// Does the last block have sufficient capacity alone?
-	if l := uint64(firstBuf.WriteSize()); l >= count {
-		dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count))
-	} else {
-		// Append blocks until sufficient.
-		count -= l
-		blocks = append(blocks, firstBuf.WriteBlock())
-		for count > 0 {
-			emptyBuf := v.pool.get()
-			v.data.PushBack(emptyBuf)
-			block := emptyBuf.WriteBlock().TakeFirst64(count)
-			count -= uint64(block.Len())
-			blocks = append(blocks, block)
-		}
-		dst = safemem.BlockSeqFromSlice(blocks)
-	}
-
-	// Perform I/O.
-	n, err := r.ReadToBlocks(dst)
-	v.size += int64(n)
-
-	// Update all indices.
-	for left := n; left > 0; firstBuf = firstBuf.Next() {
-		if l := firstBuf.WriteSize(); left >= uint64(l) {
-			firstBuf.WriteMove(l) // Whole block.
-			left -= uint64(l)
-		} else {
-			firstBuf.WriteMove(int(left)) // Partial block.
-			left = 0
-		}
-	}
-
-	return n, err
-}
-
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the
-// write index by the number of bytes written.
-func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes())
-}
-
-// ReadToSafememWriter reads up to count bytes from v to w. It does not advance
-// the read index. It calls w.WriteFromBlocks() at most once.
-func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) {
-	if count == 0 {
-		return 0, nil
-	}
-
-	var (
-		src    safemem.BlockSeq
-		blocks []safemem.Block
-	)
-
-	firstBuf := v.data.Front()
-	if firstBuf == nil {
-		return 0, nil // No EOF.
-	}
-
-	// Is all the data in a single block?
-	if l := uint64(firstBuf.ReadSize()); l >= count {
-		src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count))
-	} else {
-		// Build a list of all the buffers.
-		count -= l
-		blocks = append(blocks, firstBuf.ReadBlock())
-		for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() {
-			block := buf.ReadBlock().TakeFirst64(count)
-			count -= uint64(block.Len())
-			blocks = append(blocks, block)
-		}
-		src = safemem.BlockSeqFromSlice(blocks)
-	}
-
-	// Perform I/O. As documented, we don't advance the read index.
-	return w.WriteFromBlocks(src)
-}
-
-// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the
-// read index by the number of bytes read, such that it's only safe to call if
-// the caller guarantees that ReadToBlocks will only be called once.
-func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes())
-}
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
deleted file mode 100644
index 00652d675..000000000
--- a/pkg/buffer/view.go
+++ /dev/null
@@ -1,391 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package buffer
-
-import (
-	"fmt"
-	"io"
-)
-
-// View is a non-linear buffer.
-//
-// All methods are thread compatible.
-//
-// +stateify savable
-type View struct {
-	data bufferList
-	size int64
-	pool pool
-}
-
-// TrimFront removes the first count bytes from the buffer.
-func (v *View) TrimFront(count int64) {
-	if count >= v.size {
-		v.advanceRead(v.size)
-	} else {
-		v.advanceRead(count)
-	}
-}
-
-// ReadAt implements io.ReaderAt.ReadAt.
-func (v *View) ReadAt(p []byte, offset int64) (int, error) {
-	var (
-		skipped int64
-		done    int64
-	)
-	for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() {
-		needToSkip := int(offset - skipped)
-		if sz := buf.ReadSize(); sz <= needToSkip {
-			skipped += int64(sz)
-			continue
-		}
-
-		// Actually read data.
-		n := copy(p[done:], buf.ReadSlice()[needToSkip:])
-		skipped += int64(needToSkip)
-		done += int64(n)
-	}
-	if int(done) < len(p) || offset+done == v.size {
-		return int(done), io.EOF
-	}
-	return int(done), nil
-}
-
-// advanceRead advances the view's read index.
-//
-// Precondition: there must be sufficient bytes in the buffer.
-func (v *View) advanceRead(count int64) {
-	for buf := v.data.Front(); buf != nil && count > 0; {
-		sz := int64(buf.ReadSize())
-		if sz > count {
-			// There is still data for reading.
-			buf.ReadMove(int(count))
-			v.size -= count
-			count = 0
-			break
-		}
-
-		// Consume the whole buffer.
-		oldBuf := buf
-		buf = buf.Next() // Iterate.
-		v.data.Remove(oldBuf)
-		oldBuf.Reset()
-		v.pool.put(oldBuf)
-
-		// Update counts.
-		count -= sz
-		v.size -= sz
-	}
-	if count > 0 {
-		panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count))
-	}
-}
-
-// Truncate truncates the view to the given bytes.
-//
-// This will not grow the view, only shrink it. If a length is passed that is
-// greater than the current size of the view, then nothing will happen.
-//
-// Precondition: length must be >= 0.
-func (v *View) Truncate(length int64) {
-	if length < 0 {
-		panic("negative length provided")
-	}
-	if length >= v.size {
-		return // Nothing to do.
-	}
-	for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() {
-		sz := int64(buf.ReadSize())
-		if after := v.size - sz; after < length {
-			// Truncate the buffer locally.
-			left := (length - after)
-			buf.write = buf.read + int(left)
-			v.size = length
-			break
-		}
-
-		// Drop the buffer completely; see above.
-		v.data.Remove(buf)
-		buf.Reset()
-		v.pool.put(buf)
-		v.size -= sz
-	}
-}
-
-// Grow grows the given view to the number of bytes, which will be appended. If
-// zero is true, all these bytes will be zero. If zero is false, then this is
-// the caller's responsibility.
-//
-// Precondition: length must be >= 0.
-func (v *View) Grow(length int64, zero bool) {
-	if length < 0 {
-		panic("negative length provided")
-	}
-	for v.size < length {
-		buf := v.data.Back()
-
-		// Is there some space in the last buffer?
-		if buf == nil || buf.Full() {
-			buf = v.pool.get()
-			v.data.PushBack(buf)
-		}
-
-		// Write up to length bytes.
-		sz := buf.WriteSize()
-		if int64(sz) > length-v.size {
-			sz = int(length - v.size)
-		}
-
-		// Zero the written section; note that this pattern is
-		// specifically recognized and optimized by the compiler.
-		if zero {
-			for i := buf.write; i < buf.write+sz; i++ {
-				buf.data[i] = 0
-			}
-		}
-
-		// Advance the index.
-		buf.WriteMove(sz)
-		v.size += int64(sz)
-	}
-}
-
-// Prepend prepends the given data.
-func (v *View) Prepend(data []byte) {
-	// Is there any space in the first buffer?
-	if buf := v.data.Front(); buf != nil && buf.read > 0 {
-		// Fill up before the first write.
-		avail := buf.read
-		bStart := 0
-		dStart := len(data) - avail
-		if avail > len(data) {
-			bStart = avail - len(data)
-			dStart = 0
-		}
-		n := copy(buf.data[bStart:], data[dStart:])
-		data = data[:dStart]
-		v.size += int64(n)
-		buf.read -= n
-	}
-
-	for len(data) > 0 {
-		// Do we need an empty buffer?
-		buf := v.pool.get()
-		v.data.PushFront(buf)
-
-		// The buffer is empty; copy last chunk.
-		avail := len(buf.data)
-		bStart := 0
-		dStart := len(data) - avail
-		if avail > len(data) {
-			bStart = avail - len(data)
-			dStart = 0
-		}
-
-		// We have to put the data at the end of the current
-		// buffer in order to ensure that the next prepend will
-		// correctly fill up the beginning of this buffer.
-		n := copy(buf.data[bStart:], data[dStart:])
-		data = data[:dStart]
-		v.size += int64(n)
-		buf.read = len(buf.data) - n
-		buf.write = len(buf.data)
-	}
-}
-
-// Append appends the given data.
-func (v *View) Append(data []byte) {
-	for done := 0; done < len(data); {
-		buf := v.data.Back()
-
-		// Ensure there's a buffer with space.
-		if buf == nil || buf.Full() {
-			buf = v.pool.get()
-			v.data.PushBack(buf)
-		}
-
-		// Copy in to the given buffer.
-		n := copy(buf.WriteSlice(), data[done:])
-		done += n
-		buf.WriteMove(n)
-		v.size += int64(n)
-	}
-}
-
-// Flatten returns a flattened copy of this data.
-//
-// This method should not be used in any performance-sensitive paths. It may
-// allocate a fresh byte slice sufficiently large to contain all the data in
-// the buffer. This is principally for debugging.
-//
-// N.B. Tee data still belongs to this view, as if there is a single buffer
-// present, then it will be returned directly. This should be used for
-// temporary use only, and a reference to the given slice should not be held.
-func (v *View) Flatten() []byte {
-	if buf := v.data.Front(); buf == nil {
-		return nil // No data at all.
-	} else if buf.Next() == nil {
-		return buf.ReadSlice() // Only one buffer.
-	}
-	data := make([]byte, 0, v.size) // Need to flatten.
-	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
-		// Copy to the allocated slice.
-		data = append(data, buf.ReadSlice()...)
-	}
-	return data
-}
-
-// Size indicates the total amount of data available in this view.
-func (v *View) Size() int64 {
-	return v.size
-}
-
-// Copy makes a strict copy of this view.
-func (v *View) Copy() (other View) {
-	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
-		other.Append(buf.ReadSlice())
-	}
-	return
-}
-
-// Apply applies the given function across all valid data.
-func (v *View) Apply(fn func([]byte)) {
-	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
-		fn(buf.ReadSlice())
-	}
-}
-
-// Merge merges the provided View with this one.
-//
-// The other view will be appended to v, and other will be empty after this
-// operation completes.
-func (v *View) Merge(other *View) {
-	// Copy over all buffers.
-	for buf := other.data.Front(); buf != nil; buf = other.data.Front() {
-		other.data.Remove(buf)
-		v.data.PushBack(buf)
-	}
-
-	// Adjust sizes.
-	v.size += other.size
-	other.size = 0
-}
-
-// WriteFromReader writes to the buffer from an io.Reader.
-//
-// A minimum read size equal to unsafe.Sizeof(unintptr) is enforced,
-// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
-func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
-	var (
-		done int64
-		n    int
-		err  error
-	)
-	for done < count {
-		buf := v.data.Back()
-
-		// Ensure we have an empty buffer.
-		if buf == nil || buf.Full() {
-			buf = v.pool.get()
-			v.data.PushBack(buf)
-		}
-
-		// Is this less than the minimum batch?
-		if buf.WriteSize() < minBatch && (count-done) >= int64(minBatch) {
-			tmp := make([]byte, minBatch)
-			n, err = r.Read(tmp)
-			v.Append(tmp[:n])
-			done += int64(n)
-			if err != nil {
-				break
-			}
-			continue
-		}
-
-		// Limit the read, if necessary.
-		sz := buf.WriteSize()
-		if left := count - done; int64(sz) > left {
-			sz = int(left)
-		}
-
-		// Pass the relevant portion of the buffer.
-		n, err = r.Read(buf.WriteSlice()[:sz])
-		buf.WriteMove(n)
-		done += int64(n)
-		v.size += int64(n)
-		if err == io.EOF {
-			err = nil // Short write allowed.
-			break
-		} else if err != nil {
-			break
-		}
-	}
-	return done, err
-}
-
-// ReadToWriter reads from the buffer into an io.Writer.
-//
-// N.B. This does not consume the bytes read. TrimFront should
-// be called appropriately after this call in order to do so.
-//
-// A minimum write size equal to unsafe.Sizeof(unintptr) is enforced,
-// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
-func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
-	var (
-		done int64
-		n    int
-		err  error
-	)
-	offset := 0 // Spill-over for batching.
-	for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() {
-		// Has this been consumed? Skip it.
-		sz := buf.ReadSize()
-		if sz <= offset {
-			offset -= sz
-			continue
-		}
-		sz -= offset
-
-		// Is this less than the minimum batch?
-		left := count - done
-		if sz < minBatch && left >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
-			tmp := make([]byte, minBatch)
-			n, err = v.ReadAt(tmp, done)
-			w.Write(tmp[:n])
-			done += int64(n)
-			offset = n - sz // Reset below.
-			if err != nil {
-				break
-			}
-			continue
-		}
-
-		// Limit the write if necessary.
-		if int64(sz) >= left {
-			sz = int(left)
-		}
-
-		// Perform the actual write.
-		n, err = w.Write(buf.ReadSlice()[offset : offset+sz])
-		done += int64(n)
-		if err != nil {
-			break
-		}
-
-		// Reset spill-over.
-		offset = 0
-	}
-	return done, err
-}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index df27554d3..91d5dc174 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -407,33 +407,44 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+
+	// Order of checks is important. First check if parent directory can be
+	// executed, then check for existence, and lastly check if mount is writable.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
 	if name == "." || name == ".." {
 		return syserror.EEXIST
 	}
-	if len(name) > maxFilenameLen {
-		return syserror.ENAMETOOLONG
-	}
 	if parent.isDeleted() {
 		return syserror.ENOENT
 	}
+
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), parent, name, &ds)
+	switch {
+	case err != nil && err != syserror.ENOENT:
+		return err
+	case child != nil:
+		return syserror.EEXIST
+	}
+
 	mnt := rp.Mount()
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
 	}
 	defer mnt.EndWrite()
-	parent.dirMu.Lock()
-	defer parent.dirMu.Unlock()
+
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return err
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
 	if parent.isSynthetic() {
-		if child := parent.children[name]; child != nil {
-			return syserror.EEXIST
-		}
-		if !dir && rp.MustBeDir() {
-			return syserror.ENOENT
-		}
 		if createInSyntheticDir == nil {
 			return syserror.EPERM
 		}
@@ -449,47 +460,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
 		return nil
 	}
-	if fs.opts.interop == InteropModeShared {
-		if child := parent.children[name]; child != nil && child.isSynthetic() {
-			return syserror.EEXIST
-		}
-		if !dir && rp.MustBeDir() {
-			return syserror.ENOENT
-		}
-		// The existence of a non-synthetic dentry at name would be inconclusive
-		// because the file it represents may have been deleted from the remote
-		// filesystem, so we would need to make an RPC to revalidate the dentry.
-		// Just attempt the file creation RPC instead. If a file does exist, the
-		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
-		// stale dentry exists, the dentry will fail revalidation next time it's
-		// used.
-		if err := createInRemoteDir(parent, name, &ds); err != nil {
-			return err
-		}
-		ev := linux.IN_CREATE
-		if dir {
-			ev |= linux.IN_ISDIR
-		}
-		parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
-		return nil
-	}
-	if child := parent.children[name]; child != nil {
-		return syserror.EEXIST
-	}
-	if !dir && rp.MustBeDir() {
-		return syserror.ENOENT
-	}
-	// No cached dentry exists; however, there might still be an existing file
-	// at name. As above, we attempt the file creation RPC anyway.
+	// No cached dentry exists; however, in InteropModeShared there might still be
+	// an existing file at name. Just attempt the file creation RPC anyway. If a
+	// file does exist, the RPC will fail with EEXIST like we would have.
 	if err := createInRemoteDir(parent, name, &ds); err != nil {
 		return err
 	}
-	if child, ok := parent.children[name]; ok && child == nil {
-		// Delete the now-stale negative dentry.
-		delete(parent.children, name)
+	if fs.opts.interop != InteropModeShared {
+		if child, ok := parent.children[name]; ok && child == nil {
+			// Delete the now-stale negative dentry.
+			delete(parent.children, name)
+		}
+		parent.touchCMtime()
+		parent.dirents = nil
 	}
-	parent.touchCMtime()
-	parent.dirents = nil
 	ev := linux.IN_CREATE
 	if dir {
 		ev |= linux.IN_ISDIR
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index e77523f22..a7a553619 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -208,7 +208,9 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
 // * Filesystem.mu must be locked for at least reading.
 // * isDir(parentInode) == true.
 func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error {
-	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil {
+	// Order of checks is important. First check if parent directory can be
+	// executed, then check for existence, and lastly check if mount is writable.
+	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil {
 		return err
 	}
 	if name == "." || name == ".." {
@@ -223,6 +225,9 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string
 	if parent.VFSDentry().IsDead() {
 		return syserror.ENOENT
 	}
+	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil {
+		return err
+	}
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index d55bdc97f..e46f593c7 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -480,9 +480,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
-		return err
-	}
 	name := rp.Component()
 	if name == "." || name == ".." {
 		return syserror.EEXIST
@@ -490,11 +487,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if parent.vfsd.IsDead() {
 		return syserror.ENOENT
 	}
-	mnt := rp.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
+
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return err
 	}
-	defer mnt.EndWrite()
+
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
 
@@ -514,6 +511,14 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		return syserror.ENOENT
 	}
 
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
 	// Ensure that the parent directory is copied-up so that we can create the
 	// new file in the upper layer.
 	if err := parent.copyUpLocked(ctx); err != nil {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 9296db2fb..453e41d11 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -153,7 +153,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err != nil {
 		return err
 	}
-	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+
+	// Order of checks is important. First check if parent directory can be
+	// executed, then check for existence, and lastly check if mount is writable.
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -179,6 +182,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		return err
 	}
 	defer mnt.EndWrite()
+
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return err
+	}
 	if err := create(parentDir, name); err != nil {
 		return err
 	}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index b989e14c7..c551acd99 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -21,8 +21,8 @@ import (
 	"sync/atomic"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -75,10 +75,18 @@ type Pipe struct {
 	// mu protects all pipe internal state below.
 	mu sync.Mutex `state:"nosave"`
 
-	// view is the underlying set of buffers.
+	// buf holds the pipe's data. buf is a circular buffer; the first valid
+	// byte in buf is at offset off, and the pipe contains size valid bytes.
+	// bufBlocks contains two identical safemem.Blocks representing buf; this
+	// avoids needing to heap-allocate a new safemem.Block slice when buf is
+	// resized. bufBlockSeq is a safemem.BlockSeq representing bufBlocks.
 	//
-	// This is protected by mu.
-	view buffer.View
+	// These fields are protected by mu.
+	buf         []byte
+	bufBlocks   [2]safemem.Block `state:"nosave"`
+	bufBlockSeq safemem.BlockSeq `state:"nosave"`
+	off         int64
+	size        int64
 
 	// max is the maximum size of the pipe in bytes. When this max has been
 	// reached, writers will get EWOULDBLOCK.
@@ -99,12 +107,6 @@ type Pipe struct {
 //
 // N.B. The size will be bounded.
 func NewPipe(isNamed bool, sizeBytes int64) *Pipe {
-	if sizeBytes < MinimumPipeSize {
-		sizeBytes = MinimumPipeSize
-	}
-	if sizeBytes > MaximumPipeSize {
-		sizeBytes = MaximumPipeSize
-	}
 	var p Pipe
 	initPipe(&p, isNamed, sizeBytes)
 	return &p
@@ -175,75 +177,71 @@ func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.F
 	}
 }
 
-type readOps struct {
-	// left returns the bytes remaining.
-	left func() int64
-
-	// limit limits subsequence reads.
-	limit func(int64)
-
-	// read performs the actual read operation.
-	read func(*buffer.View) (int64, error)
-}
-
-// read reads data from the pipe into dst and returns the number of bytes
-// read, or returns ErrWouldBlock if the pipe is empty.
+// peekLocked passes the first count bytes in the pipe to f and returns its
+// result. If fewer than count bytes are available, the safemem.BlockSeq passed
+// to f will be less than count bytes in length.
 //
-// Precondition: this pipe must have readers.
-func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	return p.readLocked(ctx, ops)
-}
-
-func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
+// peekLocked does not mutate the pipe; if the read consumes bytes from the
+// pipe, then the caller is responsible for calling p.consumeLocked() and
+// p.Notify(waiter.EventOut). (The latter must be called with p.mu unlocked.)
+//
+// Preconditions:
+// * p.mu must be locked.
+// * This pipe must have readers.
+func (p *Pipe) peekLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
 	// Don't block for a zero-length read even if the pipe is empty.
-	if ops.left() == 0 {
+	if count == 0 {
 		return 0, nil
 	}
 
-	// Is the pipe empty?
-	if p.view.Size() == 0 {
-		if !p.HasWriters() {
-			// There are no writers, return EOF.
-			return 0, io.EOF
+	// Limit the amount of data read to the amount of data in the pipe.
+	if count > p.size {
+		if p.size == 0 {
+			if !p.HasWriters() {
+				return 0, io.EOF
+			}
+			return 0, syserror.ErrWouldBlock
 		}
-		return 0, syserror.ErrWouldBlock
+		count = p.size
 	}
 
-	// Limit how much we consume.
-	if ops.left() > p.view.Size() {
-		ops.limit(p.view.Size())
-	}
+	// Prepare the view of the data to be read.
+	bs := p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(count))
 
-	// Copy user data; the read op is responsible for trimming.
-	done, err := ops.read(&p.view)
-	return done, err
+	// Perform the read.
+	done, err := f(bs)
+	return int64(done), err
 }
 
-type writeOps struct {
-	// left returns the bytes remaining.
-	left func() int64
-
-	// limit should limit subsequent writes.
-	limit func(int64)
-
-	// write should write to the provided buffer.
-	write func(*buffer.View) (int64, error)
-}
-
-// write writes data from sv into the pipe and returns the number of bytes
-// written. If no bytes are written because the pipe is full (or has less than
-// atomicIOBytes free capacity), write returns ErrWouldBlock.
+// consumeLocked consumes the first n bytes in the pipe, such that they will no
+// longer be visible to future reads.
 //
-// Precondition: this pipe must have writers.
-func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	return p.writeLocked(ctx, ops)
+// Preconditions:
+// * p.mu must be locked.
+// * The pipe must contain at least n bytes.
+func (p *Pipe) consumeLocked(n int64) {
+	p.off += n
+	if max := int64(len(p.buf)); p.off >= max {
+		p.off -= max
+	}
+	p.size -= n
 }
 
-func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
+// writeLocked passes a safemem.BlockSeq representing the first count bytes of
+// unused space in the pipe to f and returns the result. If fewer than count
+// bytes are free, the safemem.BlockSeq passed to f will be less than count
+// bytes in length. If the pipe is full or otherwise cannot accommodate a write
+// of any number of bytes up to count, writeLocked returns ErrWouldBlock
+// without calling f.
+//
+// Unlike peekLocked, writeLocked assumes that f returns the number of bytes
+// written to the pipe, and increases the number of bytes stored in the pipe
+// accordingly. Callers are still responsible for calling
+// p.Notify(waiter.EventIn) with p.mu unlocked.
+//
+// Preconditions:
+// * p.mu must be locked.
+func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
 	// Can't write to a pipe with no readers.
 	if !p.HasReaders() {
 		return 0, syscall.EPIPE
@@ -251,29 +249,59 @@ func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
 
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
-	wanted := ops.left()
-	avail := p.max - p.view.Size()
-	if wanted > avail {
-		if wanted <= atomicIOBytes {
+	avail := p.max - p.size
+	short := false
+	if count > avail {
+		if count <= atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
-		ops.limit(avail)
+		count = avail
+		short = true
 	}
 
-	// Copy user data.
-	done, err := ops.write(&p.view)
-	if err != nil {
-		return done, err
+	// Ensure that the buffer is big enough.
+	if newLen, oldCap := p.size+count, int64(len(p.buf)); newLen > oldCap {
+		// Allocate a new buffer.
+		newCap := oldCap * 2
+		if oldCap == 0 {
+			newCap = 8 // arbitrary; sending individual integers across pipes is relatively common
+		}
+		for newLen > newCap {
+			newCap *= 2
+		}
+		if newCap > p.max {
+			newCap = p.max
+		}
+		newBuf := make([]byte, newCap)
+		// Copy the old buffer's contents to the beginning of the new one.
+		safemem.CopySeq(
+			safemem.BlockSeqOf(safemem.BlockFromSafeSlice(newBuf)),
+			p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(p.size)))
+		// Switch to the new buffer.
+		p.buf = newBuf
+		p.bufBlocks[0] = safemem.BlockFromSafeSlice(newBuf)
+		p.bufBlocks[1] = p.bufBlocks[0]
+		p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:])
+		p.off = 0
 	}
 
-	if done < avail {
-		// Non-failure, but short write.
-		return done, nil
+	// Prepare the view of the space to be written.
+	woff := p.off + p.size
+	if woff >= int64(len(p.buf)) {
+		woff -= int64(len(p.buf))
 	}
-	if done < wanted {
-		// Partial write due to full pipe. Note that this could also be
-		// the short write case above, we would expect a second call
-		// and the write to return zero bytes in this case.
+	bs := p.bufBlockSeq.DropFirst64(uint64(woff)).TakeFirst64(uint64(count))
+
+	// Perform the write.
+	doneU64, err := f(bs)
+	done := int64(doneU64)
+	p.size += done
+	if done < count || err != nil {
+		return done, err
+	}
+
+	// If we shortened the write, adjust the returned error appropriately.
+	if short {
 		return done, syserror.ErrWouldBlock
 	}
 
@@ -324,7 +352,7 @@ func (p *Pipe) HasWriters() bool {
 // Precondition: mu must be held.
 func (p *Pipe) rReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasReaders() && p.view.Size() != 0 {
+	if p.HasReaders() && p.size != 0 {
 		ready |= waiter.EventIn
 	}
 	if !p.HasWriters() && p.hadWriter {
@@ -350,7 +378,7 @@ func (p *Pipe) rReadiness() waiter.EventMask {
 // Precondition: mu must be held.
 func (p *Pipe) wReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasWriters() && p.view.Size() < p.max {
+	if p.HasWriters() && p.size < p.max {
 		ready |= waiter.EventOut
 	}
 	if !p.HasReaders() {
@@ -383,7 +411,7 @@ func (p *Pipe) queued() int64 {
 }
 
 func (p *Pipe) queuedLocked() int64 {
-	return p.view.Size()
+	return p.size
 }
 
 // FifoSize implements fs.FifoSizer.FifoSize.
@@ -406,7 +434,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) {
 	}
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	if size < p.view.Size() {
+	if size < p.size {
 		return 0, syserror.EBUSY
 	}
 	p.max = size
diff --git a/pkg/sentry/kernel/pipe/pipe_state_autogen.go b/pkg/sentry/kernel/pipe/pipe_state_autogen.go
index 3413c8bbb..9cee1f13a 100644
--- a/pkg/sentry/kernel/pipe/pipe_state_autogen.go
+++ b/pkg/sentry/kernel/pipe/pipe_state_autogen.go
@@ -41,7 +41,9 @@ func (p *Pipe) StateFields() []string {
 		"isNamed",
 		"readers",
 		"writers",
-		"view",
+		"buf",
+		"off",
+		"size",
 		"max",
 		"hadWriter",
 	}
@@ -54,20 +56,23 @@ func (p *Pipe) StateSave(stateSinkObject state.Sink) {
 	stateSinkObject.Save(0, &p.isNamed)
 	stateSinkObject.Save(1, &p.readers)
 	stateSinkObject.Save(2, &p.writers)
-	stateSinkObject.Save(3, &p.view)
-	stateSinkObject.Save(4, &p.max)
-	stateSinkObject.Save(5, &p.hadWriter)
+	stateSinkObject.Save(3, &p.buf)
+	stateSinkObject.Save(4, &p.off)
+	stateSinkObject.Save(5, &p.size)
+	stateSinkObject.Save(6, &p.max)
+	stateSinkObject.Save(7, &p.hadWriter)
 }
 
-func (p *Pipe) afterLoad() {}
-
 func (p *Pipe) StateLoad(stateSourceObject state.Source) {
 	stateSourceObject.Load(0, &p.isNamed)
 	stateSourceObject.Load(1, &p.readers)
 	stateSourceObject.Load(2, &p.writers)
-	stateSourceObject.Load(3, &p.view)
-	stateSourceObject.Load(4, &p.max)
-	stateSourceObject.Load(5, &p.hadWriter)
+	stateSourceObject.Load(3, &p.buf)
+	stateSourceObject.Load(4, &p.off)
+	stateSourceObject.Load(5, &p.size)
+	stateSourceObject.Load(6, &p.max)
+	stateSourceObject.Load(7, &p.hadWriter)
+	stateSourceObject.AfterLoad(p.afterLoad)
 }
 
 func (r *Reader) StateTypeName() string {
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index f665920cb..77246edbe 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -21,9 +21,9 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
-	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/marshal/primitive"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -44,46 +44,37 @@ func (p *Pipe) Release(context.Context) {
 
 // Read reads from the Pipe into dst.
 func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) {
-	n, err := p.read(ctx, readOps{
-		left: func() int64 {
-			return dst.NumBytes()
-		},
-		limit: func(l int64) {
-			dst = dst.TakeFirst64(l)
-		},
-		read: func(view *buffer.View) (int64, error) {
-			n, err := dst.CopyOutFrom(ctx, view)
-			dst = dst.DropFirst64(n)
-			view.TrimFront(n)
-			return n, err
-		},
-	})
+	n, err := dst.CopyOutFrom(ctx, p)
 	if n > 0 {
 		p.Notify(waiter.EventOut)
 	}
 	return n, err
 }
 
+// ReadToBlocks implements safemem.Reader.ReadToBlocks for Pipe.Read.
+func (p *Pipe) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	n, err := p.read(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) {
+		return safemem.CopySeq(dsts, srcs)
+	}, true /* removeFromSrc */)
+	return uint64(n), err
+}
+
+func (p *Pipe) read(count int64, f func(srcs safemem.BlockSeq) (uint64, error), removeFromSrc bool) (int64, error) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	n, err := p.peekLocked(count, f)
+	if n > 0 && removeFromSrc {
+		p.consumeLocked(n)
+	}
+	return n, err
+}
+
 // WriteTo writes to w from the Pipe.
 func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) {
-	ops := readOps{
-		left: func() int64 {
-			return count
-		},
-		limit: func(l int64) {
-			count = l
-		},
-		read: func(view *buffer.View) (int64, error) {
-			n, err := view.ReadToWriter(w, count)
-			if !dup {
-				view.TrimFront(n)
-			}
-			count -= n
-			return n, err
-		},
-	}
-	n, err := p.read(ctx, ops)
-	if n > 0 {
+	n, err := p.read(count, func(srcs safemem.BlockSeq) (uint64, error) {
+		return safemem.FromIOWriter{w}.WriteFromBlocks(srcs)
+	}, !dup /* removeFromSrc */)
+	if n > 0 && !dup {
 		p.Notify(waiter.EventOut)
 	}
 	return n, err
@@ -91,39 +82,31 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool)
 
 // Write writes to the Pipe from src.
 func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) {
-	n, err := p.write(ctx, writeOps{
-		left: func() int64 {
-			return src.NumBytes()
-		},
-		limit: func(l int64) {
-			src = src.TakeFirst64(l)
-		},
-		write: func(view *buffer.View) (int64, error) {
-			n, err := src.CopyInTo(ctx, view)
-			src = src.DropFirst64(n)
-			return n, err
-		},
-	})
+	n, err := src.CopyInTo(ctx, p)
 	if n > 0 {
 		p.Notify(waiter.EventIn)
 	}
 	return n, err
 }
 
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks for Pipe.Write.
+func (p *Pipe) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	n, err := p.write(int64(srcs.NumBytes()), func(dsts safemem.BlockSeq) (uint64, error) {
+		return safemem.CopySeq(dsts, srcs)
+	})
+	return uint64(n), err
+}
+
+func (p *Pipe) write(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	return p.writeLocked(count, f)
+}
+
 // ReadFrom reads from r to the Pipe.
 func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) {
-	n, err := p.write(ctx, writeOps{
-		left: func() int64 {
-			return count
-		},
-		limit: func(l int64) {
-			count = l
-		},
-		write: func(view *buffer.View) (int64, error) {
-			n, err := view.WriteFromReader(r, count)
-			count -= n
-			return n, err
-		},
+	n, err := p.write(count, func(dsts safemem.BlockSeq) (uint64, error) {
+		return safemem.FromIOReader{r}.ReadToBlocks(dsts)
 	})
 	if n > 0 {
 		p.Notify(waiter.EventIn)
diff --git a/pkg/buffer/view_unsafe.go b/pkg/sentry/kernel/pipe/save_restore.go
index d1ef39b26..f135827de 100644
--- a/pkg/buffer/view_unsafe.go
+++ b/pkg/sentry/kernel/pipe/save_restore.go
@@ -12,14 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package buffer
+package pipe
 
 import (
-	"unsafe"
+	"gvisor.dev/gvisor/pkg/safemem"
 )
 
-// minBatch is the smallest Read or Write operation that the
-// WriteFromReader and ReadToWriter functions will use.
-//
-// This is defined as the size of a native pointer.
-const minBatch = int(unsafe.Sizeof(uintptr(0)))
+// afterLoad is called by stateify.
+func (p *Pipe) afterLoad() {
+	p.bufBlocks[0] = safemem.BlockFromSafeSlice(p.buf)
+	p.bufBlocks[1] = p.bufBlocks[0]
+	p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:])
+}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 2d47d2e82..d5a91730d 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -16,7 +16,6 @@ package pipe
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -269,12 +268,10 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
 // SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
 func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
 	fd.pipe.mu.Lock()
-	defer fd.pipe.mu.Unlock()
 
 	// Cap the sequence at number of bytes actually available.
-	v := fd.pipe.queuedLocked()
-	if v < count {
-		count = v
+	if count > fd.pipe.size {
+		count = fd.pipe.size
 	}
 	src := usermem.IOSequence{
 		IO:    fd,
@@ -291,154 +288,97 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti
 		n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
 	}
 	if n > 0 {
-		fd.pipe.view.TrimFront(n)
+		fd.pipe.consumeLocked(n)
+	}
+
+	fd.pipe.mu.Unlock()
+
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventOut)
 	}
 	return n, err
 }
 
 // SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
 func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
-	fd.pipe.mu.Lock()
-	defer fd.pipe.mu.Unlock()
-
 	dst := usermem.IOSequence{
 		IO:    fd,
 		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
 	}
 
+	var (
+		n   int64
+		err error
+	)
+	fd.pipe.mu.Lock()
 	if off == -1 {
-		return in.Read(ctx, dst, vfs.ReadOptions{})
+		n, err = in.Read(ctx, dst, vfs.ReadOptions{})
+	} else {
+		n, err = in.PRead(ctx, dst, off, vfs.ReadOptions{})
+	}
+	fd.pipe.mu.Unlock()
+
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
 	}
-	return in.PRead(ctx, dst, off, vfs.ReadOptions{})
+	return n, err
 }
 
 // CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
-// responsibility to trim fd.pipe.view after the read is completed.
+// responsibility to call fd.pipe.consumeLocked() and
+// fd.pipe.Notify(waiter.EventOut) after the read is completed.
+//
+// Preconditions: fd.pipe.mu must be locked.
 func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
-	origCount := int64(len(dst))
-	n, err := fd.pipe.readLocked(ctx, readOps{
-		left: func() int64 {
-			return int64(len(dst))
-		},
-		limit: func(l int64) {
-			dst = dst[:l]
-		},
-		read: func(view *buffer.View) (int64, error) {
-			n, err := view.ReadAt(dst, 0)
-			return int64(n), err
-		},
+	n, err := fd.pipe.peekLocked(int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) {
+		return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs)
 	})
-	if n > 0 {
-		fd.pipe.Notify(waiter.EventOut)
-	}
-	if err == nil && n != origCount {
-		return int(n), syserror.ErrWouldBlock
-	}
 	return int(n), err
 }
 
-// CopyOut implements usermem.IO.CopyOut.
+// CopyOut implements usermem.IO.CopyOut. Note that it is the caller's
+// responsibility to call fd.pipe.Notify(waiter.EventIn) after the
+// write is completed.
+//
+// Preconditions: fd.pipe.mu must be locked.
 func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
-	origCount := int64(len(src))
-	n, err := fd.pipe.writeLocked(ctx, writeOps{
-		left: func() int64 {
-			return int64(len(src))
-		},
-		limit: func(l int64) {
-			src = src[:l]
-		},
-		write: func(view *buffer.View) (int64, error) {
-			view.Append(src)
-			return int64(len(src)), nil
-		},
+	n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) {
+		return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
 	})
-	if n > 0 {
-		fd.pipe.Notify(waiter.EventIn)
-	}
-	if err == nil && n != origCount {
-		return int(n), syserror.ErrWouldBlock
-	}
 	return int(n), err
 }
 
 // ZeroOut implements usermem.IO.ZeroOut.
+//
+// Preconditions: fd.pipe.mu must be locked.
 func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
-	origCount := toZero
-	n, err := fd.pipe.writeLocked(ctx, writeOps{
-		left: func() int64 {
-			return toZero
-		},
-		limit: func(l int64) {
-			toZero = l
-		},
-		write: func(view *buffer.View) (int64, error) {
-			view.Grow(view.Size()+toZero, true /* zero */)
-			return toZero, nil
-		},
+	n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) {
+		return safemem.ZeroSeq(dsts)
 	})
-	if n > 0 {
-		fd.pipe.Notify(waiter.EventIn)
-	}
-	if err == nil && n != origCount {
-		return n, syserror.ErrWouldBlock
-	}
 	return n, err
 }
 
 // CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
-// responsibility to trim fd.pipe.view after the read is completed.
+// responsibility to call fd.pipe.consumeLocked() and
+// fd.pipe.Notify(waiter.EventOut) after the read is completed.
+//
+// Preconditions: fd.pipe.mu must be locked.
 func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
-	count := ars.NumBytes()
-	if count == 0 {
-		return 0, nil
-	}
-	origCount := count
-	n, err := fd.pipe.readLocked(ctx, readOps{
-		left: func() int64 {
-			return count
-		},
-		limit: func(l int64) {
-			count = l
-		},
-		read: func(view *buffer.View) (int64, error) {
-			n, err := view.ReadToSafememWriter(dst, uint64(count))
-			return int64(n), err
-		},
+	return fd.pipe.peekLocked(ars.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) {
+		return dst.WriteFromBlocks(srcs)
 	})
-	if n > 0 {
-		fd.pipe.Notify(waiter.EventOut)
-	}
-	if err == nil && n != origCount {
-		return n, syserror.ErrWouldBlock
-	}
-	return n, err
 }
 
 // CopyOutFrom implements usermem.IO.CopyOutFrom.
+//
+// Preconditions: fd.pipe.mu must be locked.
 func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
-	count := ars.NumBytes()
-	if count == 0 {
-		return 0, nil
-	}
-	origCount := count
-	n, err := fd.pipe.writeLocked(ctx, writeOps{
-		left: func() int64 {
-			return count
-		},
-		limit: func(l int64) {
-			count = l
-		},
-		write: func(view *buffer.View) (int64, error) {
-			n, err := view.WriteFromSafememReader(src, uint64(count))
-			return int64(n), err
-		},
+	n, err := fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) {
+		return src.ReadToBlocks(dsts)
 	})
 	if n > 0 {
 		fd.pipe.Notify(waiter.EventIn)
 	}
-	if err == nil && n != origCount {
-		return n, syserror.ErrWouldBlock
-	}
 	return n, err
 }
 
@@ -481,37 +421,23 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr
 	}
 
 	lockTwoPipes(dst.pipe, src.pipe)
-	defer dst.pipe.mu.Unlock()
-	defer src.pipe.mu.Unlock()
-
-	n, err := dst.pipe.writeLocked(ctx, writeOps{
-		left: func() int64 {
-			return count
-		},
-		limit: func(l int64) {
-			count = l
-		},
-		write: func(dstView *buffer.View) (int64, error) {
-			return src.pipe.readLocked(ctx, readOps{
-				left: func() int64 {
-					return count
-				},
-				limit: func(l int64) {
-					count = l
-				},
-				read: func(srcView *buffer.View) (int64, error) {
-					n, err := srcView.ReadToSafememWriter(dstView, uint64(count))
-					if n > 0 && removeFromSrc {
-						srcView.TrimFront(int64(n))
-					}
-					return int64(n), err
-				},
-			})
-		},
+	n, err := dst.pipe.writeLocked(count, func(dsts safemem.BlockSeq) (uint64, error) {
+		n, err := src.pipe.peekLocked(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) {
+			return safemem.CopySeq(dsts, srcs)
+		})
+		if n > 0 && removeFromSrc {
+			src.pipe.consumeLocked(n)
+		}
+		return uint64(n), err
 	})
+	dst.pipe.mu.Unlock()
+	src.pipe.mu.Unlock()
+
 	if n > 0 {
 		dst.pipe.Notify(waiter.EventIn)
-		src.pipe.Notify(waiter.EventOut)
+		if removeFromSrc {
+			src.pipe.Notify(waiter.EventOut)
+		}
 	}
 	return n, err
 }
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 94fb425b2..03749a8bf 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -186,6 +186,21 @@ var Metrics = tcpip.Stats{
 		IPTablesInputDropped:                mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."),
 		IPTablesOutputDropped:               mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."),
 	},
+	ARP: tcpip.ARPStats{
+		PacketsReceived:                                 mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."),
+		DisabledPacketsReceived:                         mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."),
+		MalformedPacketsReceived:                        mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."),
+		RequestsReceived:                                mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."),
+		RequestsReceivedUnknownTargetAddress:            mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."),
+		OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."),
+		OutgoingRequestBadLocalAddressErrors:            mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."),
+		OutgoingRequestNetworkUnreachableErrors:         mustCreateMetric("/netstack/arp/outgoing_requests_network_unreachable", "Number of failed attempts to send an ARP request with a network unreachable error."),
+		OutgoingRequestsDropped:                         mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."),
+		OutgoingRequestsSent:                            mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."),
+		RepliesReceived:                                 mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."),
+		OutgoingRepliesDropped:                          mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."),
+		OutgoingRepliesSent:                             mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."),
+	},
 	TCP: tcpip.TCPStats{
 		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
 		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index 1c4cdb0dd..134051124 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -29,24 +29,23 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) {
 		return 0, syserror.EINVAL
 	}
-
+	if opts.Length == 0 {
+		return 0, nil
+	}
 	if opts.Length > int64(kernel.MAX_RW_COUNT) {
 		opts.Length = int64(kernel.MAX_RW_COUNT)
 	}
 
 	var (
-		total int64
 		n     int64
 		err   error
 		inCh  chan struct{}
 		outCh chan struct{}
 	)
 
-	for opts.Length > 0 {
+	for {
 		n, err = fs.Splice(t, outFile, inFile, opts)
-		opts.Length -= n
-		total += n
-		if err != syserror.ErrWouldBlock {
+		if n != 0 || err != syserror.ErrWouldBlock {
 			break
 		} else if err == syserror.ErrWouldBlock && nonBlocking {
 			break
@@ -87,13 +86,13 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 		}
 	}
 
-	if total > 0 {
+	if n > 0 {
 		// On Linux, inotify behavior is not very consistent with splice(2). We try
 		// our best to emulate Linux for very basic calls to splice, where for some
 		// reason, events are generated for output files, but not input files.
 		outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
 	}
-	return total, err
+	return n, err
 }
 
 // Sendfile implements linux system call sendfile(2).
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 3d5c0d270..3259d052f 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -119,21 +119,28 @@ func (*endpoint) WriteHeaderIncludedPacket(*stack.Route, *stack.PacketBuffer) *t
 }
 
 func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) {
+	stats := e.protocol.stack.Stats().ARP
+	stats.PacketsReceived.Increment()
+
 	if !e.isEnabled() {
+		stats.DisabledPacketsReceived.Increment()
 		return
 	}
 
 	h := header.ARP(pkt.NetworkHeader().View())
 	if !h.IsValid() {
+		stats.MalformedPacketsReceived.Increment()
 		return
 	}
 
 	switch h.Op() {
 	case header.ARPRequest:
+		stats.RequestsReceived.Increment()
 		localAddr := tcpip.Address(h.ProtocolAddressTarget())
 
 		if e.nud == nil {
 			if e.linkAddrCache.CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 {
+				stats.RequestsReceivedUnknownTargetAddress.Increment()
 				return // we have no useful answer, ignore the request
 			}
 
@@ -142,6 +149,7 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) {
 			e.linkAddrCache.AddLinkAddress(e.nic.ID(), addr, linkAddr)
 		} else {
 			if e.protocol.stack.CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 {
+				stats.RequestsReceivedUnknownTargetAddress.Increment()
 				return // we have no useful answer, ignore the request
 			}
 
@@ -177,9 +185,14 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) {
 		//
 		//   Send the packet to the (new) target hardware address on the same
 		//   hardware on which the request was received.
-		_ = e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), nil /* gso */, ProtocolNumber, respPkt)
+		if err := e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), nil /* gso */, ProtocolNumber, respPkt); err != nil {
+			stats.OutgoingRepliesDropped.Increment()
+		} else {
+			stats.OutgoingRepliesSent.Increment()
+		}
 
 	case header.ARPReply:
+		stats.RepliesReceived.Increment()
 		addr := tcpip.Address(h.ProtocolAddressSender())
 		linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
 
@@ -233,6 +246,8 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 
 // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest.
 func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, nic stack.NetworkInterface) *tcpip.Error {
+	stats := p.stack.Stats().ARP
+
 	if len(remoteLinkAddr) == 0 {
 		remoteLinkAddr = header.EthernetBroadcastAddress
 	}
@@ -241,15 +256,18 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot
 	if len(localAddr) == 0 {
 		addr, err := p.stack.GetMainNICAddress(nicID, header.IPv4ProtocolNumber)
 		if err != nil {
+			stats.OutgoingRequestInterfaceHasNoLocalAddressErrors.Increment()
 			return err
 		}
 
 		if len(addr.Address) == 0 {
+			stats.OutgoingRequestNetworkUnreachableErrors.Increment()
 			return tcpip.ErrNetworkUnreachable
 		}
 
 		localAddr = addr.Address
 	} else if p.stack.CheckLocalAddress(nicID, header.IPv4ProtocolNumber, localAddr) == 0 {
+		stats.OutgoingRequestBadLocalAddressErrors.Increment()
 		return tcpip.ErrBadLocalAddress
 	}
 
@@ -269,7 +287,12 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot
 	if n := copy(h.ProtocolAddressTarget(), targetAddr); n != header.IPv4AddressSize {
 		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize))
 	}
-	return nic.WritePacketToRemote(remoteLinkAddr, nil /* gso */, ProtocolNumber, pkt)
+	if err := nic.WritePacketToRemote(remoteLinkAddr, nil /* gso */, ProtocolNumber, pkt); err != nil {
+		stats.OutgoingRequestsDropped.Increment()
+		return err
+	}
+	stats.OutgoingRequestsSent.Increment()
+	return nil
 }
 
 // ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress.
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 002ddaf67..49d4912ad 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1591,6 +1591,59 @@ type IPStats struct {
 	OptionUnknownReceived *StatCounter
 }
 
+// ARPStats collects ARP-specific stats.
+type ARPStats struct {
+	// PacketsReceived is the number of ARP packets received from the link layer.
+	PacketsReceived *StatCounter
+
+	// DisabledPacketsReceived is the number of ARP packets received from the link
+	// layer when the ARP layer is disabled.
+	DisabledPacketsReceived *StatCounter
+
+	// MalformedPacketsReceived is the number of ARP packets that were dropped due
+	// to being malformed.
+	MalformedPacketsReceived *StatCounter
+
+	// RequestsReceived is the number of ARP requests received.
+	RequestsReceived *StatCounter
+
+	// RequestsReceivedUnknownTargetAddress is the number of ARP requests that
+	// were targeted to an interface different from the one they were received on.
+	RequestsReceivedUnknownTargetAddress *StatCounter
+
+	// OutgoingRequestInterfaceHasNoLocalAddressErrors is the number of failures
+	// to send an ARP request because the interface has no network address
+	// assigned to it.
+	OutgoingRequestInterfaceHasNoLocalAddressErrors *StatCounter
+
+	// OutgoingRequestBadLocalAddressErrors is the number of failures to send an
+	// ARP request with a bad local address.
+	OutgoingRequestBadLocalAddressErrors *StatCounter
+
+	// OutgoingRequestNetworkUnreachableErrors is the number of failures to send
+	// an ARP request with a network unreachable error.
+	OutgoingRequestNetworkUnreachableErrors *StatCounter
+
+	// OutgoingRequestsDropped is the number of ARP requests which failed to write
+	// to a link-layer endpoint.
+	OutgoingRequestsDropped *StatCounter
+
+	// OutgoingRequestsSent is the number of ARP requests successfully written to
+	// a link-layer endpoint.
+	OutgoingRequestsSent *StatCounter
+
+	// RepliesReceived is the number of ARP replies received.
+	RepliesReceived *StatCounter
+
+	// OutgoingRepliesDropped is the number of ARP replies which failed to write
+	// to a link-layer endpoint.
+	OutgoingRepliesDropped *StatCounter
+
+	// OutgoingRepliesSent is the number of ARP replies successfully written to a
+	// link-layer endpoint.
+	OutgoingRepliesSent *StatCounter
+}
+
 // TCPStats collects TCP-specific stats.
 type TCPStats struct {
 	// ActiveConnectionOpenings is the number of connections opened
@@ -1743,6 +1796,9 @@ type Stats struct {
 	// IP breaks out IP-specific stats (both v4 and v6).
 	IP IPStats
 
+	// ARP breaks out ARP-specific stats.
+	ARP ARPStats
+
 	// TCP breaks out TCP-specific stats.
 	TCP TCPStats
 
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 9e8872fc9..6921de0f1 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -305,10 +305,7 @@ func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, q
 
 	// Initialize and start the handshake.
 	h := ep.newPassiveHandshake(isn, irs, opts, deferAccept)
-	if err := h.start(); err != nil {
-		l.cleanupFailedHandshake(h)
-		return nil, err
-	}
+	h.start()
 	return h, nil
 }
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index f45d26a87..6cdbb8bee 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -53,7 +53,6 @@ const (
 	wakerForNotification = iota
 	wakerForNewSegment
 	wakerForResend
-	wakerForResolution
 )
 
 const (
@@ -460,9 +459,9 @@ func (h *handshake) processSegments() *tcpip.Error {
 	return nil
 }
 
-// start resolves the route if necessary and sends the first
-// SYN/SYN-ACK.
-func (h *handshake) start() *tcpip.Error {
+// start sends the first SYN/SYN-ACK. It does not block, even if link address
+// resolution is required.
+func (h *handshake) start() {
 	h.startTime = time.Now()
 	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 	var sackEnabled tcpip.TCPSACKEnabled
@@ -503,7 +502,6 @@ func (h *handshake) start() *tcpip.Error {
 		ack:    h.ackNum,
 		rcvWnd: h.rcvWnd,
 	}, synOpts)
-	return nil
 }
 
 // complete completes the TCP 3-way handshake initiated by h.start().
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index ddbed7e46..a4508e871 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2325,68 +2325,17 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	if run {
-		if err := e.startMainLoop(handshake); err != nil {
-			return err
-		}
-	}
-
-	return tcpip.ErrConnectStarted
-}
-
-// startMainLoop sends the initial SYN and starts the main loop for the
-// endpoint.
-func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error {
-	preloop := func() *tcpip.Error {
 		if handshake {
 			h := e.newHandshake()
 			e.setEndpointState(StateSynSent)
-			if err := h.start(); err != nil {
-				e.lastErrorMu.Lock()
-				e.lastError = err
-				e.lastErrorMu.Unlock()
-
-				e.setEndpointState(StateError)
-				e.hardError = err
-
-				// Call cleanupLocked to free up any reservations.
-				e.cleanupLocked()
-				return err
-			}
+			h.start()
 		}
 		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
-		return nil
-	}
-
-	if e.route.IsResolutionRequired() {
-		// If the endpoint is closed between releasing e.mu and the goroutine below
-		// acquiring it, make sure that cleanup is deferred to the new goroutine.
 		e.workerRunning = true
-
-		// Sending the initial SYN may block due to route resolution; do it in a
-		// separate goroutine to avoid blocking the syscall goroutine.
-		go func() { // S/R-SAFE: will be drained before save.
-			e.mu.Lock()
-			if err := preloop(); err != nil {
-				e.workerRunning = false
-				e.mu.Unlock()
-				return
-			}
-			e.mu.Unlock()
-			_ = e.protocolMainLoop(handshake, nil)
-		}()
-		return nil
+		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
 	}
 
-	// No route resolution is required, so we can send the initial SYN here without
-	// blocking. This will hopefully reduce overall latency by overlapping time
-	// spent waiting for a SYN-ACK and time spent spinning up a new goroutine
-	// for the main loop.
-	if err := preloop(); err != nil {
-		return err
-	}
-	e.workerRunning = true
-	go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
-	return nil
+	return tcpip.ErrConnectStarted
 }
 
 // ConnectEndpoint is not supported.
author	gVisor bot <gvisor-bot@google.com>	2021-01-15 15:47:01 +0000
committer	gVisor bot <gvisor-bot@google.com>	2021-01-15 15:47:01 +0000
commit	578c5460b62f52063bef41203940a315deced6b3 (patch)
tree	25ee54afd2fba0133c549110656d9efc631031c9
parent	6cc587a931cb704006e5d843f725b4be2d1523c9 (diff)
parent	e57ebcd37a7b9f98d80e594f2c0baf2220d7b830 (diff)