author    gVisor bot <gvisor-bot@google.com>    2021-05-13 21:00:53 +0000
committer gVisor bot <gvisor-bot@google.com>    2021-05-13 21:00:53 +0000
commit    e1cfd3185c285f4dc69804210cc0d77ec582beb5 (patch)
tree      866c789fcc0f67ec0933e3cf88fed6628e85b6f6
parent    f4d9f967005fdf7995439f56839cbb4a7589ff6c (diff)
parent    84f04cc858644e9748a82f33b834a84c8b0fc934 (diff)
Merge release-20210510.0-27-g84f04cc85 (automated)
-rw-r--r--  pkg/buffer/buffer.go                                      105
-rw-r--r--  pkg/buffer/buffer_list.go                                 221
-rw-r--r--  pkg/buffer/buffer_state_autogen.go                        163
-rw-r--r--  pkg/buffer/buffer_unsafe_state_autogen.go                   3
-rw-r--r--  pkg/buffer/pool.go                                         90
-rw-r--r--  pkg/buffer/safemem.go                                     133
-rw-r--r--  pkg/buffer/view.go                                        566
-rw-r--r--  pkg/buffer/view_unsafe.go                                  25
-rw-r--r--  pkg/tcpip/header/parse/parse.go                             6
-rw-r--r--  pkg/tcpip/network/internal/fragmentation/reassembler.go    3
-rw-r--r--  pkg/tcpip/network/ipv6/ipv6.go                             10
-rw-r--r--  pkg/tcpip/stack/packet_buffer.go                          367
12 files changed, 1494 insertions, 198 deletions
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
new file mode 100644
index 000000000..5b77a6a3f
--- /dev/null
+++ b/pkg/buffer/buffer.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package buffer provides the implementation of a buffer view.
+//
+// A view is a flexible buffer, supporting the safecopy operations natively as
+// well as the ability to grow via either prepend or append, and to shrink.
+package buffer
+
+// buffer encapsulates a queueable byte buffer.
+//
+// +stateify savable
+type buffer struct {
+ data []byte
+ read int
+ write int
+ bufferEntry
+}
+
+// init performs in-place initialization for zero value.
+func (b *buffer) init(size int) {
+ b.data = make([]byte, size)
+}
+
+// initWithData initializes b with data, taking ownership.
+func (b *buffer) initWithData(data []byte) {
+ b.data = data
+ b.read = 0
+ b.write = len(data)
+}
+
+// Reset resets read and write locations, effectively emptying the buffer.
+func (b *buffer) Reset() {
+ b.read = 0
+ b.write = 0
+}
+
+// Remove removes r from the unread portion. It returns false if r does not
+// fully reside in b.
+func (b *buffer) Remove(r Range) bool {
+ sz := b.ReadSize()
+ switch {
+ case r.Len() != r.Intersect(Range{end: sz}).Len():
+ return false
+ case r.Len() == 0:
+ // Noop
+ case r.begin == 0:
+ b.read += r.end
+ case r.end == sz:
+ b.write -= r.Len()
+ default:
+ // Remove from the middle of b.data.
+ copy(b.data[b.read+r.begin:], b.data[b.read+r.end:b.write])
+ b.write -= r.Len()
+ }
+ return true
+}
+
+// Full indicates the buffer is full.
+//
+// This indicates there is no capacity left to write.
+func (b *buffer) Full() bool {
+ return b.write == len(b.data)
+}
+
+// ReadSize returns the number of bytes available for reading.
+func (b *buffer) ReadSize() int {
+ return b.write - b.read
+}
+
+// ReadMove advances the read index by the given amount.
+func (b *buffer) ReadMove(n int) {
+ b.read += n
+}
+
+// ReadSlice returns the read slice for this buffer.
+func (b *buffer) ReadSlice() []byte {
+ return b.data[b.read:b.write]
+}
+
+// WriteSize returns the number of bytes available for writing.
+func (b *buffer) WriteSize() int {
+ return len(b.data) - b.write
+}
+
+// WriteMove advances the write index by the given amount.
+func (b *buffer) WriteMove(n int) {
+ b.write += n
+}
+
+// WriteSlice returns the write slice for this buffer.
+func (b *buffer) WriteSlice() []byte {
+ return b.data[b.write:]
+}
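
For illustration, here is a minimal standalone sketch of the read/write index
bookkeeping used by the buffer type above. The toyBuffer type and its names are
hypothetical; data[read:write] is the readable region and data[write:] is the
writable region, exactly as in ReadSlice and WriteSlice.

package main

import "fmt"

// toyBuffer mirrors the unexported buffer type's index scheme.
type toyBuffer struct {
	data  []byte
	read  int
	write int
}

func (b *toyBuffer) readSlice() []byte  { return b.data[b.read:b.write] }
func (b *toyBuffer) writeSlice() []byte { return b.data[b.write:] }

func main() {
	b := &toyBuffer{data: make([]byte, 8)}
	n := copy(b.writeSlice(), "abc")
	b.write += n                      // WriteMove: commit the written bytes
	fmt.Printf("%q\n", b.readSlice()) // "abc"
	b.read++                          // ReadMove: consume one byte
	fmt.Printf("%q\n", b.readSlice()) // "bc"
}
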
diff --git a/pkg/buffer/buffer_list.go b/pkg/buffer/buffer_list.go
new file mode 100644
index 000000000..6b5bea3fc
--- /dev/null
+++ b/pkg/buffer/buffer_list.go
@@ -0,0 +1,221 @@
+package buffer
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type bufferElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (bufferElementMapper) linkerFor(elem *buffer) *buffer { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type bufferList struct {
+ head *buffer
+ tail *buffer
+}
+
+// Reset resets list l to the empty state.
+func (l *bufferList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+//
+//go:nosplit
+func (l *bufferList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+//
+//go:nosplit
+func (l *bufferList) Front() *buffer {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+//
+//go:nosplit
+func (l *bufferList) Back() *buffer {
+ return l.tail
+}
+
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+//
+//go:nosplit
+func (l *bufferList) Len() (count int) {
+ for e := l.Front(); e != nil; e = (bufferElementMapper{}.linkerFor(e)).Next() {
+ count++
+ }
+ return count
+}
+
+// PushFront inserts the element e at the front of list l.
+//
+//go:nosplit
+func (l *bufferList) PushFront(e *buffer) {
+ linker := bufferElementMapper{}.linkerFor(e)
+ linker.SetNext(l.head)
+ linker.SetPrev(nil)
+ if l.head != nil {
+ bufferElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+//
+//go:nosplit
+func (l *bufferList) PushBack(e *buffer) {
+ linker := bufferElementMapper{}.linkerFor(e)
+ linker.SetNext(nil)
+ linker.SetPrev(l.tail)
+ if l.tail != nil {
+ bufferElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+//
+//go:nosplit
+func (l *bufferList) PushBackList(m *bufferList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ bufferElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ bufferElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+//
+//go:nosplit
+func (l *bufferList) InsertAfter(b, e *buffer) {
+ bLinker := bufferElementMapper{}.linkerFor(b)
+ eLinker := bufferElementMapper{}.linkerFor(e)
+
+ a := bLinker.Next()
+
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ bLinker.SetNext(e)
+
+ if a != nil {
+ bufferElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+//
+//go:nosplit
+func (l *bufferList) InsertBefore(a, e *buffer) {
+ aLinker := bufferElementMapper{}.linkerFor(a)
+ eLinker := bufferElementMapper{}.linkerFor(e)
+
+ b := aLinker.Prev()
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ aLinker.SetPrev(e)
+
+ if b != nil {
+ bufferElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+//
+//go:nosplit
+func (l *bufferList) Remove(e *buffer) {
+ linker := bufferElementMapper{}.linkerFor(e)
+ prev := linker.Prev()
+ next := linker.Next()
+
+ if prev != nil {
+ bufferElementMapper{}.linkerFor(prev).SetNext(next)
+ } else if l.head == e {
+ l.head = next
+ }
+
+ if next != nil {
+ bufferElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else if l.tail == e {
+ l.tail = prev
+ }
+
+ linker.SetNext(nil)
+ linker.SetPrev(nil)
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type bufferEntry struct {
+ next *buffer
+ prev *buffer
+}
+
+// Next returns the entry that follows e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) Next() *buffer {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) Prev() *buffer {
+ return e.prev
+}
+
+// SetNext assigns 'elem' as the entry that follows e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) SetNext(elem *buffer) {
+ e.next = elem
+}
+
+// SetPrev assigns 'elem' as the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) SetPrev(elem *buffer) {
+ e.prev = elem
+}
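
As a rough sketch (illustration only), a hypothetical test in package buffer
could exercise this intrusive list as follows; bufferList, pool, and buffer are
the unexported types added by this commit:

package buffer

import "testing"

func TestBufferListSketch(t *testing.T) {
	var l bufferList
	var p pool

	// Queue two buffers and walk them front to back. Each buffer embeds a
	// bufferEntry, so linking requires no per-node allocation.
	l.PushBack(p.get())
	l.PushBack(p.get())
	n := 0
	for e := l.Front(); e != nil; e = e.Next() {
		n++
	}
	if n != 2 {
		t.Fatalf("got %d elements, want 2", n)
	}

	// Unlink the head and recycle it through the pool.
	head := l.Front()
	l.Remove(head)
	p.put(head)
}
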
diff --git a/pkg/buffer/buffer_state_autogen.go b/pkg/buffer/buffer_state_autogen.go
new file mode 100644
index 000000000..aaa72b723
--- /dev/null
+++ b/pkg/buffer/buffer_state_autogen.go
@@ -0,0 +1,163 @@
+// automatically generated by stateify.
+
+package buffer
+
+import (
+ "gvisor.dev/gvisor/pkg/state"
+)
+
+func (b *buffer) StateTypeName() string {
+ return "pkg/buffer.buffer"
+}
+
+func (b *buffer) StateFields() []string {
+ return []string{
+ "data",
+ "read",
+ "write",
+ "bufferEntry",
+ }
+}
+
+func (b *buffer) beforeSave() {}
+
+// +checklocksignore
+func (b *buffer) StateSave(stateSinkObject state.Sink) {
+ b.beforeSave()
+ stateSinkObject.Save(0, &b.data)
+ stateSinkObject.Save(1, &b.read)
+ stateSinkObject.Save(2, &b.write)
+ stateSinkObject.Save(3, &b.bufferEntry)
+}
+
+func (b *buffer) afterLoad() {}
+
+// +checklocksignore
+func (b *buffer) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &b.data)
+ stateSourceObject.Load(1, &b.read)
+ stateSourceObject.Load(2, &b.write)
+ stateSourceObject.Load(3, &b.bufferEntry)
+}
+
+func (l *bufferList) StateTypeName() string {
+ return "pkg/buffer.bufferList"
+}
+
+func (l *bufferList) StateFields() []string {
+ return []string{
+ "head",
+ "tail",
+ }
+}
+
+func (l *bufferList) beforeSave() {}
+
+// +checklocksignore
+func (l *bufferList) StateSave(stateSinkObject state.Sink) {
+ l.beforeSave()
+ stateSinkObject.Save(0, &l.head)
+ stateSinkObject.Save(1, &l.tail)
+}
+
+func (l *bufferList) afterLoad() {}
+
+// +checklocksignore
+func (l *bufferList) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &l.head)
+ stateSourceObject.Load(1, &l.tail)
+}
+
+func (e *bufferEntry) StateTypeName() string {
+ return "pkg/buffer.bufferEntry"
+}
+
+func (e *bufferEntry) StateFields() []string {
+ return []string{
+ "next",
+ "prev",
+ }
+}
+
+func (e *bufferEntry) beforeSave() {}
+
+// +checklocksignore
+func (e *bufferEntry) StateSave(stateSinkObject state.Sink) {
+ e.beforeSave()
+ stateSinkObject.Save(0, &e.next)
+ stateSinkObject.Save(1, &e.prev)
+}
+
+func (e *bufferEntry) afterLoad() {}
+
+// +checklocksignore
+func (e *bufferEntry) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &e.next)
+ stateSourceObject.Load(1, &e.prev)
+}
+
+func (p *pool) StateTypeName() string {
+ return "pkg/buffer.pool"
+}
+
+func (p *pool) StateFields() []string {
+ return []string{
+ "bufferSize",
+ "embeddedStorage",
+ }
+}
+
+func (p *pool) beforeSave() {}
+
+// +checklocksignore
+func (p *pool) StateSave(stateSinkObject state.Sink) {
+ p.beforeSave()
+ stateSinkObject.Save(0, &p.bufferSize)
+ stateSinkObject.Save(1, &p.embeddedStorage)
+}
+
+// +checklocksignore
+func (p *pool) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &p.bufferSize)
+ stateSourceObject.LoadWait(1, &p.embeddedStorage)
+ stateSourceObject.AfterLoad(p.afterLoad)
+}
+
+func (v *View) StateTypeName() string {
+ return "pkg/buffer.View"
+}
+
+func (v *View) StateFields() []string {
+ return []string{
+ "data",
+ "size",
+ "pool",
+ }
+}
+
+func (v *View) beforeSave() {}
+
+// +checklocksignore
+func (v *View) StateSave(stateSinkObject state.Sink) {
+ v.beforeSave()
+ stateSinkObject.Save(0, &v.data)
+ stateSinkObject.Save(1, &v.size)
+ stateSinkObject.Save(2, &v.pool)
+}
+
+func (v *View) afterLoad() {}
+
+// +checklocksignore
+func (v *View) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &v.data)
+ stateSourceObject.Load(1, &v.size)
+ stateSourceObject.Load(2, &v.pool)
+}
+
+func init() {
+ state.Register((*buffer)(nil))
+ state.Register((*bufferList)(nil))
+ state.Register((*bufferEntry)(nil))
+ state.Register((*pool)(nil))
+ state.Register((*View)(nil))
+}
diff --git a/pkg/buffer/buffer_unsafe_state_autogen.go b/pkg/buffer/buffer_unsafe_state_autogen.go
new file mode 100644
index 000000000..5a5c40722
--- /dev/null
+++ b/pkg/buffer/buffer_unsafe_state_autogen.go
@@ -0,0 +1,3 @@
+// automatically generated by stateify.
+
+package buffer
diff --git a/pkg/buffer/pool.go b/pkg/buffer/pool.go
new file mode 100644
index 000000000..2ec41dd4f
--- /dev/null
+++ b/pkg/buffer/pool.go
@@ -0,0 +1,90 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+const (
+ // embeddedCount is the number of buffer structures embedded in the pool. It
+ // is also the number of buffers allocated at a time when the pool overflows.
+ embeddedCount = 8
+
+ // defaultBufferSize is the default size of each underlying storage buffer.
+ //
+ // It is slightly less than two pages. This is done intentionally to ensure
+ // that the buffer object aligns with runtime internals. This two-page size
+ // effectively minimizes internal fragmentation while still providing chunks
+ // large enough to limit excessive segmentation.
+ defaultBufferSize = 8144
+)
+
+// pool allocates buffers.
+//
+// It contains embedded buffer storage for the fast path when the number of
+// buffers needed is small.
+//
+// +stateify savable
+type pool struct {
+ bufferSize int
+ avail []buffer `state:"nosave"`
+ embeddedStorage [embeddedCount]buffer `state:"wait"`
+}
+
+// get gets a new buffer from p.
+func (p *pool) get() *buffer {
+ buf := p.getNoInit()
+ buf.init(p.bufferSize)
+ return buf
+}
+
+// getNoInit gets a new buffer from p without initializing it.
+func (p *pool) getNoInit() *buffer {
+ if p.avail == nil {
+ p.avail = p.embeddedStorage[:]
+ }
+ if len(p.avail) == 0 {
+ p.avail = make([]buffer, embeddedCount)
+ }
+ if p.bufferSize <= 0 {
+ p.bufferSize = defaultBufferSize
+ }
+ buf := &p.avail[0]
+ p.avail = p.avail[1:]
+ return buf
+}
+
+// put releases buf.
+func (p *pool) put(buf *buffer) {
+ // Remove reference to the underlying storage, allowing it to be garbage
+ // collected.
+ buf.data = nil
+ buf.Reset()
+}
+
+// setBufferSize sets the size of underlying storage buffer for future
+// allocations. It can be called at any time.
+func (p *pool) setBufferSize(size int) {
+ p.bufferSize = size
+}
+
+// afterLoad is invoked by stateify.
+func (p *pool) afterLoad() {
+ // S/R does not correctly save a subslice of embeddedStorage. Restore the
+ // available portion of embeddedStorage manually; leave avail as nil if none
+ // of the embedded buffers were in use.
+ for i := len(p.embeddedStorage); i > 0; i-- {
+ if p.embeddedStorage[i-1].data != nil {
+ p.avail = p.embeddedStorage[i:]
+ break
+ }
+ }
+}
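
A short sketch of the pool's fast path (hypothetical test code in package
buffer, for illustration): the first embeddedCount allocations come from the
embedded array, and later ones from freshly allocated slabs.

package buffer

import "testing"

func TestPoolSketch(t *testing.T) {
	var p pool // zero value is ready; bufferSize defaults to defaultBufferSize

	var bufs []*buffer
	for i := 0; i < embeddedCount+1; i++ {
		bufs = append(bufs, p.get()) // the 9th get spills past embeddedStorage
	}
	if got := bufs[0].WriteSize(); got != defaultBufferSize {
		t.Fatalf("WriteSize = %d, want %d", got, defaultBufferSize)
	}
	for _, b := range bufs {
		p.put(b) // drops the data reference; the struct itself is not reused
	}
}
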
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
new file mode 100644
index 000000000..8b42575b4
--- /dev/null
+++ b/pkg/buffer/safemem.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+ "gvisor.dev/gvisor/pkg/safemem"
+)
+
+// WriteBlock returns this buffer as a write Block.
+func (b *buffer) WriteBlock() safemem.Block {
+ return safemem.BlockFromSafeSlice(b.WriteSlice())
+}
+
+// ReadBlock returns this buffer as a read Block.
+func (b *buffer) ReadBlock() safemem.Block {
+ return safemem.BlockFromSafeSlice(b.ReadSlice())
+}
+
+// WriteFromSafememReader writes up to count bytes from r to v and advances the
+// write index by the number of bytes written. It calls r.ReadToBlocks() at
+// most once.
+func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) {
+ if count == 0 {
+ return 0, nil
+ }
+
+ var (
+ dst safemem.BlockSeq
+ blocks []safemem.Block
+ )
+
+ // Need at least one buffer.
+ firstBuf := v.data.Back()
+ if firstBuf == nil {
+ firstBuf = v.pool.get()
+ v.data.PushBack(firstBuf)
+ }
+
+ // Does the last block have sufficient capacity alone?
+ if l := uint64(firstBuf.WriteSize()); l >= count {
+ dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count))
+ } else {
+ // Append blocks until sufficient.
+ count -= l
+ blocks = append(blocks, firstBuf.WriteBlock())
+ for count > 0 {
+ emptyBuf := v.pool.get()
+ v.data.PushBack(emptyBuf)
+ block := emptyBuf.WriteBlock().TakeFirst64(count)
+ count -= uint64(block.Len())
+ blocks = append(blocks, block)
+ }
+ dst = safemem.BlockSeqFromSlice(blocks)
+ }
+
+ // Perform I/O.
+ n, err := r.ReadToBlocks(dst)
+ v.size += int64(n)
+
+ // Update all indices.
+ for left := n; left > 0; firstBuf = firstBuf.Next() {
+ if l := firstBuf.WriteSize(); left >= uint64(l) {
+ firstBuf.WriteMove(l) // Whole block.
+ left -= uint64(l)
+ } else {
+ firstBuf.WriteMove(int(left)) // Partial block.
+ left = 0
+ }
+ }
+
+ return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the
+// write index by the number of bytes written.
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes())
+}
+
+// ReadToSafememWriter reads up to count bytes from v to w. It does not advance
+// the read index. It calls w.WriteFromBlocks() at most once.
+func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) {
+ if count == 0 {
+ return 0, nil
+ }
+
+ var (
+ src safemem.BlockSeq
+ blocks []safemem.Block
+ )
+
+ firstBuf := v.data.Front()
+ if firstBuf == nil {
+ return 0, nil // No EOF.
+ }
+
+ // Is all the data in a single block?
+ if l := uint64(firstBuf.ReadSize()); l >= count {
+ src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count))
+ } else {
+ // Build a list of all the buffers.
+ count -= l
+ blocks = append(blocks, firstBuf.ReadBlock())
+ for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() {
+ block := buf.ReadBlock().TakeFirst64(count)
+ count -= uint64(block.Len())
+ blocks = append(blocks, block)
+ }
+ src = safemem.BlockSeqFromSlice(blocks)
+ }
+
+ // Perform I/O. As documented, we don't advance the read index.
+ return w.WriteFromBlocks(src)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the
+// read index by the number of bytes read, so it is only safe to call if the
+// caller guarantees that ReadToBlocks will be called at most once.
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes())
+}
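
A minimal sketch of driving these safemem hooks from outside the package,
assuming the gvisor.dev/gvisor/pkg/buffer and gvisor.dev/gvisor/pkg/safemem
import paths used in this commit (illustration only):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/buffer"
	"gvisor.dev/gvisor/pkg/safemem"
)

func main() {
	var v buffer.View

	// Write through the safemem.Writer interface; the write index advances.
	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice([]byte("hello world")))
	n, err := v.WriteFromBlocks(src)
	fmt.Println(n, err, v.Size()) // 11 <nil> 11

	// Read back through the safemem.Reader interface; note that the read index
	// is NOT advanced, so this can only safely be done once.
	out := make([]byte, v.Size())
	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(out))
	m, _ := v.ReadToBlocks(dst)
	fmt.Printf("%q\n", out[:m]) // "hello world"
}
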
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
new file mode 100644
index 000000000..7bcfcd543
--- /dev/null
+++ b/pkg/buffer/view.go
@@ -0,0 +1,566 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+ "fmt"
+ "io"
+)
+
+// Buffer is an alias for View.
+type Buffer = View
+
+// View is a non-linear buffer.
+//
+// All methods are thread compatible.
+//
+// +stateify savable
+type View struct {
+ data bufferList
+ size int64
+ pool pool
+}
+
+// TrimFront removes the first count bytes from the buffer.
+func (v *View) TrimFront(count int64) {
+ if count >= v.size {
+ v.advanceRead(v.size)
+ } else {
+ v.advanceRead(count)
+ }
+}
+
+// Remove deletes data at the specified location in v. It returns false if the
+// specified range does not fully reside in v.
+func (v *View) Remove(offset, length int) bool {
+ if offset < 0 || length < 0 {
+ return false
+ }
+ tgt := Range{begin: offset, end: offset + length}
+ if tgt.Len() != tgt.Intersect(Range{end: int(v.size)}).Len() {
+ return false
+ }
+
+ // Scan through each buffer and remove intersections.
+ var curr Range
+ for buf := v.data.Front(); buf != nil; {
+ origLen := buf.ReadSize()
+ curr.end = curr.begin + origLen
+
+ if x := curr.Intersect(tgt); x.Len() > 0 {
+ if !buf.Remove(x.Offset(-curr.begin)) {
+ panic("buf.Remove() failed")
+ }
+ if buf.ReadSize() == 0 {
+ // buf fully removed, removing it from the list.
+ oldBuf := buf
+ buf = buf.Next()
+ v.data.Remove(oldBuf)
+ v.pool.put(oldBuf)
+ } else {
+ // Only partial data intersects, moving on to next one.
+ buf = buf.Next()
+ }
+ v.size -= int64(x.Len())
+ } else {
+ // This buffer is not in range, moving on to next one.
+ buf = buf.Next()
+ }
+
+ curr.begin += origLen
+ if curr.begin >= tgt.end {
+ break
+ }
+ }
+ return true
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (v *View) ReadAt(p []byte, offset int64) (int, error) {
+ var (
+ skipped int64
+ done int64
+ )
+ for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() {
+ needToSkip := int(offset - skipped)
+ if sz := buf.ReadSize(); sz <= needToSkip {
+ skipped += int64(sz)
+ continue
+ }
+
+ // Actually read data.
+ n := copy(p[done:], buf.ReadSlice()[needToSkip:])
+ skipped += int64(needToSkip)
+ done += int64(n)
+ }
+ if int(done) < len(p) || offset+done == v.size {
+ return int(done), io.EOF
+ }
+ return int(done), nil
+}
+
+// advanceRead advances the view's read index.
+//
+// Precondition: there must be sufficient bytes in the buffer.
+func (v *View) advanceRead(count int64) {
+ for buf := v.data.Front(); buf != nil && count > 0; {
+ sz := int64(buf.ReadSize())
+ if sz > count {
+ // There is still data for reading.
+ buf.ReadMove(int(count))
+ v.size -= count
+ count = 0
+ break
+ }
+
+ // Consume the whole buffer.
+ oldBuf := buf
+ buf = buf.Next() // Iterate.
+ v.data.Remove(oldBuf)
+ v.pool.put(oldBuf)
+
+ // Update counts.
+ count -= sz
+ v.size -= sz
+ }
+ if count > 0 {
+ panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count))
+ }
+}
+
+// Truncate truncates the view to the given bytes.
+//
+// This will not grow the view, only shrink it. If a length is passed that is
+// greater than the current size of the view, then nothing will happen.
+//
+// Precondition: length must be >= 0.
+func (v *View) Truncate(length int64) {
+ if length < 0 {
+ panic("negative length provided")
+ }
+ if length >= v.size {
+ return // Nothing to do.
+ }
+ for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() {
+ sz := int64(buf.ReadSize())
+ if after := v.size - sz; after < length {
+ // Truncate the buffer locally.
+ left := (length - after)
+ buf.write = buf.read + int(left)
+ v.size = length
+ break
+ }
+
+ // Drop the buffer completely; see above.
+ v.data.Remove(buf)
+ v.pool.put(buf)
+ v.size -= sz
+ }
+}
+
+// Grow grows the view so that it holds at least length bytes, appending as
+// needed. If zero is true, the appended bytes will be zeroed. If zero is
+// false, zeroing them is the caller's responsibility.
+//
+// Precondition: length must be >= 0.
+func (v *View) Grow(length int64, zero bool) {
+ if length < 0 {
+ panic("negative length provided")
+ }
+ for v.size < length {
+ buf := v.data.Back()
+
+ // Is there some space in the last buffer?
+ if buf == nil || buf.Full() {
+ buf = v.pool.get()
+ v.data.PushBack(buf)
+ }
+
+ // Write up to length bytes.
+ sz := buf.WriteSize()
+ if int64(sz) > length-v.size {
+ sz = int(length - v.size)
+ }
+
+ // Zero the written section; note that this pattern is
+ // specifically recognized and optimized by the compiler.
+ if zero {
+ for i := buf.write; i < buf.write+sz; i++ {
+ buf.data[i] = 0
+ }
+ }
+
+ // Advance the index.
+ buf.WriteMove(sz)
+ v.size += int64(sz)
+ }
+}
+
+// Prepend prepends the given data.
+func (v *View) Prepend(data []byte) {
+ // Is there any space in the first buffer?
+ if buf := v.data.Front(); buf != nil && buf.read > 0 {
+ // Fill up before the first write.
+ avail := buf.read
+ bStart := 0
+ dStart := len(data) - avail
+ if avail > len(data) {
+ bStart = avail - len(data)
+ dStart = 0
+ }
+ n := copy(buf.data[bStart:], data[dStart:])
+ data = data[:dStart]
+ v.size += int64(n)
+ buf.read -= n
+ }
+
+ for len(data) > 0 {
+ // Do we need an empty buffer?
+ buf := v.pool.get()
+ v.data.PushFront(buf)
+
+ // The buffer is empty; copy last chunk.
+ avail := len(buf.data)
+ bStart := 0
+ dStart := len(data) - avail
+ if avail > len(data) {
+ bStart = avail - len(data)
+ dStart = 0
+ }
+
+ // We have to put the data at the end of the current
+ // buffer in order to ensure that the next prepend will
+ // correctly fill up the beginning of this buffer.
+ n := copy(buf.data[bStart:], data[dStart:])
+ data = data[:dStart]
+ v.size += int64(n)
+ buf.read = len(buf.data) - n
+ buf.write = len(buf.data)
+ }
+}
+
+// Append appends the given data.
+func (v *View) Append(data []byte) {
+ for done := 0; done < len(data); {
+ buf := v.data.Back()
+
+ // Ensure there's a buffer with space.
+ if buf == nil || buf.Full() {
+ buf = v.pool.get()
+ v.data.PushBack(buf)
+ }
+
+ // Copy in to the given buffer.
+ n := copy(buf.WriteSlice(), data[done:])
+ done += n
+ buf.WriteMove(n)
+ v.size += int64(n)
+ }
+}
+
+// AppendOwned takes ownership of data and appends it to v.
+func (v *View) AppendOwned(data []byte) {
+ if len(data) > 0 {
+ buf := v.pool.getNoInit()
+ buf.initWithData(data)
+ v.data.PushBack(buf)
+ v.size += int64(len(data))
+ }
+}
+
+// PullUp makes the specified range contiguous and returns the backing memory.
+func (v *View) PullUp(offset, length int) ([]byte, bool) {
+ if length == 0 {
+ return nil, true
+ }
+ tgt := Range{begin: offset, end: offset + length}
+ if tgt.Intersect(Range{end: int(v.size)}).Len() != length {
+ return nil, false
+ }
+
+ curr := Range{}
+ buf := v.data.Front()
+ for ; buf != nil; buf = buf.Next() {
+ origLen := buf.ReadSize()
+ curr.end = curr.begin + origLen
+
+ if x := curr.Intersect(tgt); x.Len() == tgt.Len() {
+ // buf covers the whole requested target range.
+ sub := x.Offset(-curr.begin)
+ return buf.ReadSlice()[sub.begin:sub.end], true
+ } else if x.Len() > 0 {
+ // buf is pointing at the starting buffer we want to merge.
+ break
+ }
+
+ curr.begin += origLen
+ }
+
+ // Calculate the total merged length.
+ totLen := 0
+ for n := buf; n != nil; n = n.Next() {
+ totLen += n.ReadSize()
+ if curr.begin+totLen >= tgt.end {
+ break
+ }
+ }
+
+ // Merge the buffers.
+ data := make([]byte, totLen)
+ off := 0
+ for n := buf; n != nil && off < totLen; {
+ copy(data[off:], n.ReadSlice())
+ off += n.ReadSize()
+
+ // Remove buffers except for the first one, which will be reused.
+ if n == buf {
+ n = n.Next()
+ } else {
+ old := n
+ n = n.Next()
+ v.data.Remove(old)
+ v.pool.put(old)
+ }
+ }
+
+ // Update the first buffer with merged data.
+ buf.initWithData(data)
+
+ r := tgt.Offset(-curr.begin)
+ return buf.data[r.begin:r.end], true
+}
+
+// Flatten returns a flattened copy of this data.
+//
+// This method should not be used in any performance-sensitive paths. It may
+// allocate a fresh byte slice sufficiently large to contain all the data in
+// the buffer. This is principally for debugging.
+//
+// N.B. The data may still belong to this view: if there is a single buffer
+// present, it will be returned directly. The result should be used for
+// temporary purposes only, and a reference to the returned slice should not
+// be held.
+func (v *View) Flatten() []byte {
+ if buf := v.data.Front(); buf == nil {
+ return nil // No data at all.
+ } else if buf.Next() == nil {
+ return buf.ReadSlice() // Only one buffer.
+ }
+ data := make([]byte, 0, v.size) // Need to flatten.
+ for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+ // Copy to the allocated slice.
+ data = append(data, buf.ReadSlice()...)
+ }
+ return data
+}
+
+// Size indicates the total amount of data available in this view.
+func (v *View) Size() int64 {
+ return v.size
+}
+
+// Copy makes a strict copy of this view.
+func (v *View) Copy() (other View) {
+ for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+ other.Append(buf.ReadSlice())
+ }
+ return
+}
+
+// Apply applies the given function across all valid data.
+func (v *View) Apply(fn func([]byte)) {
+ for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+ fn(buf.ReadSlice())
+ }
+}
+
+// SubApply applies fn to a given range of data in v. Any part of the range
+// outside of v is ignored.
+func (v *View) SubApply(offset, length int, fn func([]byte)) {
+ for buf := v.data.Front(); length > 0 && buf != nil; buf = buf.Next() {
+ d := buf.ReadSlice()
+ if offset >= len(d) {
+ offset -= len(d)
+ continue
+ }
+ if offset > 0 {
+ d = d[offset:]
+ offset = 0
+ }
+ if length < len(d) {
+ d = d[:length]
+ }
+ fn(d)
+ length -= len(d)
+ }
+}
+
+// Merge merges the provided View with this one.
+//
+// The other view will be appended to v, and other will be empty after this
+// operation completes.
+func (v *View) Merge(other *View) {
+ // Copy over all buffers.
+ for buf := other.data.Front(); buf != nil; buf = other.data.Front() {
+ other.data.Remove(buf)
+ v.data.PushBack(buf)
+ }
+
+ // Adjust sizes.
+ v.size += other.size
+ other.size = 0
+}
+
+// WriteFromReader writes to the buffer from an io.Reader.
+//
+// A minimum read size equal to unsafe.Sizeof(uintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
+func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
+ var (
+ done int64
+ n int
+ err error
+ )
+ for done < count {
+ buf := v.data.Back()
+
+ // Ensure we have an empty buffer.
+ if buf == nil || buf.Full() {
+ buf = v.pool.get()
+ v.data.PushBack(buf)
+ }
+
+ // Is this less than the minimum batch?
+ if buf.WriteSize() < minBatch && (count-done) >= int64(minBatch) {
+ tmp := make([]byte, minBatch)
+ n, err = r.Read(tmp)
+ v.Append(tmp[:n])
+ done += int64(n)
+ if err != nil {
+ break
+ }
+ continue
+ }
+
+ // Limit the read, if necessary.
+ sz := buf.WriteSize()
+ if left := count - done; int64(sz) > left {
+ sz = int(left)
+ }
+
+ // Pass the relevant portion of the buffer.
+ n, err = r.Read(buf.WriteSlice()[:sz])
+ buf.WriteMove(n)
+ done += int64(n)
+ v.size += int64(n)
+ if err == io.EOF {
+ err = nil // Short write allowed.
+ break
+ } else if err != nil {
+ break
+ }
+ }
+ return done, err
+}
+
+// ReadToWriter reads from the buffer into an io.Writer.
+//
+// N.B. This does not consume the bytes read. TrimFront should
+// be called appropriately after this call in order to do so.
+//
+// A minimum write size equal to unsafe.Sizeof(uintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
+func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
+ var (
+ done int64
+ n int
+ err error
+ )
+ offset := 0 // Spill-over for batching.
+ for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() {
+ // Has this been consumed? Skip it.
+ sz := buf.ReadSize()
+ if sz <= offset {
+ offset -= sz
+ continue
+ }
+ sz -= offset
+
+ // Is this less than the minimum batch?
+ left := count - done
+ if sz < minBatch && left >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
+ tmp := make([]byte, minBatch)
+ n, err = v.ReadAt(tmp, done)
+ w.Write(tmp[:n])
+ done += int64(n)
+ offset = n - sz // Reset below.
+ if err != nil {
+ break
+ }
+ continue
+ }
+
+ // Limit the write if necessary.
+ if int64(sz) >= left {
+ sz = int(left)
+ }
+
+ // Perform the actual write.
+ n, err = w.Write(buf.ReadSlice()[offset : offset+sz])
+ done += int64(n)
+ if err != nil {
+ break
+ }
+
+ // Reset spill-over.
+ offset = 0
+ }
+ return done, err
+}
+
+// A Range specifies a range of the buffer.
+type Range struct {
+ begin int
+ end int
+}
+
+// Intersect returns the intersection of x and y.
+func (x Range) Intersect(y Range) Range {
+ if x.begin < y.begin {
+ x.begin = y.begin
+ }
+ if x.end > y.end {
+ x.end = y.end
+ }
+ if x.begin >= x.end {
+ return Range{}
+ }
+ return x
+}
+
+// Offset returns x offset by off.
+func (x Range) Offset(off int) Range {
+ x.begin += off
+ x.end += off
+ return x
+}
+
+// Len returns the length of x.
+func (x Range) Len() int {
+ l := x.end - x.begin
+ if l < 0 {
+ l = 0
+ }
+ return l
+}
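
A brief sketch of the exported View API defined above, from a client's point of
view (illustration only; assumes the gvisor.dev/gvisor/pkg/buffer import path
used in this commit):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/buffer"
)

func main() {
	var v buffer.View // the zero value is an empty, usable view

	v.Append([]byte("payload"))
	v.Prepend([]byte("header "))
	fmt.Println(v.Size()) // 14

	// Random-access read that does not consume anything.
	tmp := make([]byte, 6)
	n, _ := v.ReadAt(tmp, 0)
	fmt.Printf("%q\n", tmp[:n]) // "header"

	// Consume the prepended header, leaving only the payload.
	v.TrimFront(7)
	fmt.Printf("%q\n", v.Flatten()) // "payload"
}
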
diff --git a/pkg/buffer/view_unsafe.go b/pkg/buffer/view_unsafe.go
new file mode 100644
index 000000000..d1ef39b26
--- /dev/null
+++ b/pkg/buffer/view_unsafe.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+ "unsafe"
+)
+
+// minBatch is the smallest Read or Write operation that the
+// WriteFromReader and ReadToWriter functions will use.
+//
+// This is defined as the size of a native pointer.
+const minBatch = int(unsafe.Sizeof(uintptr(0)))
diff --git a/pkg/tcpip/header/parse/parse.go b/pkg/tcpip/header/parse/parse.go
index ebb4b2c1d..1c913b5e1 100644
--- a/pkg/tcpip/header/parse/parse.go
+++ b/pkg/tcpip/header/parse/parse.go
@@ -60,9 +60,13 @@ func IPv4(pkt *stack.PacketBuffer) bool {
return false
}
ipHdr = header.IPv4(hdr)
+ length := int(ipHdr.TotalLength()) - len(hdr)
+ if length < 0 {
+ return false
+ }
pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
- pkt.Data().CapLength(int(ipHdr.TotalLength()) - len(hdr))
+ pkt.Data().CapLength(length)
return true
}
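
For context, the new check rejects an IPv4 header whose TotalLength field
claims fewer bytes than the header that was already pulled up; with such a
packet the old code would pass a negative value to CapLength. A toy
illustration with hypothetical numbers:

package main

import "fmt"

func main() {
	// Hypothetical malformed packet: TotalLength (18) < header length (20).
	totalLength, headerLen := 18, 20
	if length := totalLength - headerLen; length < 0 {
		fmt.Println("reject: advertised payload length", length, "is negative")
		return
	}
}
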
diff --git a/pkg/tcpip/network/internal/fragmentation/reassembler.go b/pkg/tcpip/network/internal/fragmentation/reassembler.go
index 90075a70c..56b76a284 100644
--- a/pkg/tcpip/network/internal/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/internal/fragmentation/reassembler.go
@@ -167,8 +167,7 @@ func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *s
resPkt := r.holes[0].pkt
for i := 1; i < len(r.holes); i++ {
- fragData := r.holes[i].pkt.Data()
- resPkt.Data().ReadFromData(fragData, fragData.Size())
+ stack.MergeFragment(resPkt, r.holes[i].pkt)
}
return resPkt, r.proto, true, memConsumed, nil
}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 880290b4b..febbb3f38 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -1472,13 +1472,19 @@ func (e *endpoint) processExtensionHeaders(h header.IPv6, pkt *stack.PacketBuffe
// If the last header in the payload isn't a known IPv6 extension header,
// handle it as if it is transport layer data.
+ // Calculate the number of octets parsed from data. We want to remove all
+ // the data except the unparsed portion located at the end, whose size is
+ // extHdr.Buf.Size().
+ trim := pkt.Data().Size() - extHdr.Buf.Size()
+
// For unfragmented packets, extHdr still contains the transport header.
// Get rid of it.
//
// For reassembled fragments, pkt.TransportHeader is unset, so this is a
// no-op and pkt.Data begins with the transport header.
- extHdr.Buf.TrimFront(pkt.TransportHeader().View().Size())
- pkt.Data().Replace(extHdr.Buf)
+ trim += pkt.TransportHeader().View().Size()
+
+ pkt.Data().DeleteFront(trim)
stats.PacketsDelivered.Increment()
if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index fc3c54e34..e2e073091 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -16,9 +16,10 @@ package stack
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/buffer"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
- "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ tcpipbuffer "gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
)
@@ -39,7 +40,7 @@ type PacketBufferOptions struct {
// Data is the initial unparsed data for the new packet. If set, it will be
// owned by the new packet.
- Data buffer.VectorisedView
+ Data tcpipbuffer.VectorisedView
// IsForwardedPacket identifies that the PacketBuffer being created is for a
// forwarded packet.
@@ -56,6 +57,34 @@ type PacketBufferOptions struct {
// empty. Use of PacketBuffer in any other order is unsupported.
//
// PacketBuffer must be created with NewPacketBuffer.
+//
+// Internal structure: A PacketBuffer holds a pointer to buffer.Buffer, which
+// exposes logically contiguous byte storage. The underlying storage structure
+// is abstracted out and should not usually be a concern here.
+//
+// |- reserved ->|
+// |--->| consumed (incoming)
+// 0 V V
+// +--------+----+----+--------------------+
+// | | | | current data ... | (buf)
+// +--------+----+----+--------------------+
+// ^ |
+// |<---| pushed (outgoing)
+//
+// When a PacketBuffer is created, a `reserved` header region can be specified,
+// into which the stack pushes headers for an outgoing packet. An incoming
+// packet may have no such region, in which case `reserved` is 0. The value of
+// `reserved` never changes over the entire lifetime of the packet.
+//
+// Outgoing Packet: When a header is pushed, `pushed` gets incremented by the
+// pushed length, and the current value is stored for each header. PacketBuffer
+// subtracts this value from `reserved` to compute the starting offset of each
+// header in `buf`.
+//
+// Incoming Packet: When a header is consumed (a.k.a. parsed), the current
+// `consumed` value is stored for each header, and it gets incremented by the
+// consumed length. PacketBuffer adds this value to `reserved` to compute the
+// starting offset of each header in `buf`.
type PacketBuffer struct {
_ sync.NoCopy
@@ -63,28 +92,16 @@ type PacketBuffer struct {
// PacketBuffers.
PacketBufferEntry
- // data holds the payload of the packet.
- //
- // For inbound packets, Data is initially the whole packet. Then gets moved to
- // headers via PacketHeader.Consume, when the packet is being parsed.
- //
- // For outbound packets, Data is the innermost layer, defined by the protocol.
- // Headers are pushed in front of it via PacketHeader.Push.
- //
- // The bytes backing Data are immutable, a.k.a. users shouldn't write to its
- // backing storage.
- data buffer.VectorisedView
+ // buf is the underlying buffer for the packet. See struct level docs for
+ // details.
+ buf *buffer.Buffer
+ reserved int
+ pushed int
+ consumed int
// headers stores metadata about each header.
headers [numHeaderType]headerInfo
- // header is the internal storage for outbound packets. Headers will be pushed
- // (prepended) on this storage as the packet is being constructed.
- //
- // TODO(gvisor.dev/issue/2404): Switch to an implementation that header and
- // data are held in the same underlying buffer storage.
- header buffer.Prependable
-
// NetworkProtocolNumber is only valid when NetworkHeader().View().IsEmpty()
// returns false.
// TODO(gvisor.dev/issue/3574): Remove the separately passed protocol
@@ -131,10 +148,14 @@ type PacketBuffer struct {
// NewPacketBuffer creates a new PacketBuffer with opts.
func NewPacketBuffer(opts PacketBufferOptions) *PacketBuffer {
pk := &PacketBuffer{
- data: opts.Data,
+ buf: &buffer.Buffer{},
}
if opts.ReserveHeaderBytes != 0 {
- pk.header = buffer.NewPrependable(opts.ReserveHeaderBytes)
+ pk.buf.AppendOwned(make([]byte, opts.ReserveHeaderBytes))
+ pk.reserved = opts.ReserveHeaderBytes
+ }
+ for _, v := range opts.Data.Views() {
+ pk.buf.AppendOwned(v)
}
if opts.IsForwardedPacket {
pk.NetworkPacketInfo.IsForwardedPacket = opts.IsForwardedPacket
@@ -145,13 +166,13 @@ func NewPacketBuffer(opts PacketBufferOptions) *PacketBuffer {
// ReservedHeaderBytes returns the number of bytes initially reserved for
// headers.
func (pk *PacketBuffer) ReservedHeaderBytes() int {
- return pk.header.UsedLength() + pk.header.AvailableLength()
+ return pk.reserved
}
// AvailableHeaderBytes returns the number of bytes currently available for
// headers. This is relevant to PacketHeader.Push method only.
func (pk *PacketBuffer) AvailableHeaderBytes() int {
- return pk.header.AvailableLength()
+ return pk.reserved - pk.pushed
}
// LinkHeader returns the handle to link-layer header.
@@ -180,24 +201,18 @@ func (pk *PacketBuffer) TransportHeader() PacketHeader {
// HeaderSize returns the total size of all headers in bytes.
func (pk *PacketBuffer) HeaderSize() int {
- // Note for inbound packets (Consume called), headers are not stored in
- // pk.header. Thus, calculation of size of each header is needed.
- var size int
- for i := range pk.headers {
- size += len(pk.headers[i].buf)
- }
- return size
+ return pk.pushed + pk.consumed
}
// Size returns the size of packet in bytes.
func (pk *PacketBuffer) Size() int {
- return pk.HeaderSize() + pk.data.Size()
+ return int(pk.buf.Size()) - pk.headerOffset()
}
// MemSize returns the estimation size of the pk in memory, including backing
// buffer data.
func (pk *PacketBuffer) MemSize() int {
- return pk.HeaderSize() + pk.data.MemSize() + packetBufferStructSize
+ return int(pk.buf.Size()) + packetBufferStructSize
}
// Data returns the handle to data portion of pk.
@@ -206,61 +221,65 @@ func (pk *PacketBuffer) Data() PacketData {
}
// Views returns the underlying storage of the whole packet.
-func (pk *PacketBuffer) Views() []buffer.View {
- // Optimization for outbound packets that headers are in pk.header.
- useHeader := true
- for i := range pk.headers {
- if !canUseHeader(&pk.headers[i]) {
- useHeader = false
- break
- }
- }
+func (pk *PacketBuffer) Views() []tcpipbuffer.View {
+ var views []tcpipbuffer.View
+ offset := pk.headerOffset()
+ pk.buf.SubApply(offset, int(pk.buf.Size())-offset, func(v []byte) {
+ views = append(views, v)
+ })
+ return views
+}
- dataViews := pk.data.Views()
-
- var vs []buffer.View
- if useHeader {
- vs = make([]buffer.View, 0, 1+len(dataViews))
- vs = append(vs, pk.header.View())
- } else {
- vs = make([]buffer.View, 0, len(pk.headers)+len(dataViews))
- for i := range pk.headers {
- if v := pk.headers[i].buf; len(v) > 0 {
- vs = append(vs, v)
- }
- }
- }
- return append(vs, dataViews...)
+func (pk *PacketBuffer) headerOffset() int {
+ return pk.reserved - pk.pushed
+}
+
+func (pk *PacketBuffer) headerOffsetOf(typ headerType) int {
+ return pk.reserved + pk.headers[typ].offset
}
-func canUseHeader(h *headerInfo) bool {
- // h.offset will be negative if the header was pushed in to prependable
- // portion, or doesn't matter when it's empty.
- return len(h.buf) == 0 || h.offset < 0
+func (pk *PacketBuffer) dataOffset() int {
+ return pk.reserved + pk.consumed
}
-func (pk *PacketBuffer) push(typ headerType, size int) buffer.View {
+func (pk *PacketBuffer) push(typ headerType, size int) tcpipbuffer.View {
h := &pk.headers[typ]
- if h.buf != nil {
+ if h.length > 0 {
panic(fmt.Sprintf("push must not be called twice: type %s", typ))
}
- h.buf = buffer.View(pk.header.Prepend(size))
- h.offset = -pk.header.UsedLength()
- return h.buf
+ if pk.pushed+size > pk.reserved {
+ panic("not enough headroom reserved")
+ }
+ pk.pushed += size
+ h.offset = -pk.pushed
+ h.length = size
+ return pk.headerView(typ)
}
-func (pk *PacketBuffer) consume(typ headerType, size int) (v buffer.View, consumed bool) {
+func (pk *PacketBuffer) consume(typ headerType, size int) (v tcpipbuffer.View, consumed bool) {
h := &pk.headers[typ]
- if h.buf != nil {
+ if h.length > 0 {
panic(fmt.Sprintf("consume must not be called twice: type %s", typ))
}
- v, ok := pk.data.PullUp(size)
+ if pk.headerOffset()+pk.consumed+size > int(pk.buf.Size()) {
+ return nil, false
+ }
+ h.offset = pk.consumed
+ h.length = size
+ pk.consumed += size
+ return pk.headerView(typ), true
+}
+
+func (pk *PacketBuffer) headerView(typ headerType) tcpipbuffer.View {
+ h := &pk.headers[typ]
+ if h.length == 0 {
+ return nil
+ }
+ v, ok := pk.buf.PullUp(pk.headerOffsetOf(typ), h.length)
if !ok {
- return
+ panic("PullUp failed")
}
- pk.data.TrimFront(size)
- h.buf = v
- return h.buf, true
+ return v
}
// Clone makes a shallow copy of pk.
@@ -270,9 +289,11 @@ func (pk *PacketBuffer) consume(typ headerType, size int) (v buffer.View, consum
func (pk *PacketBuffer) Clone() *PacketBuffer {
return &PacketBuffer{
PacketBufferEntry: pk.PacketBufferEntry,
- data: pk.data.Clone(nil),
+ buf: pk.buf,
+ reserved: pk.reserved,
+ pushed: pk.pushed,
+ consumed: pk.consumed,
headers: pk.headers,
- header: pk.header,
Hash: pk.Hash,
Owner: pk.Owner,
GSOOptions: pk.GSOOptions,
@@ -306,9 +327,11 @@ func (pk *PacketBuffer) Network() header.Network {
// See PacketBuffer.Data for details about how a packet buffer holds an inbound
// packet.
func (pk *PacketBuffer) CloneToInbound() *PacketBuffer {
- newPk := NewPacketBuffer(PacketBufferOptions{
- Data: buffer.NewVectorisedView(pk.Size(), pk.Views()),
- })
+ newPk := &PacketBuffer{
+ buf: pk.buf,
+ // Treat unfilled header portion as reserved.
+ reserved: pk.AvailableHeaderBytes(),
+ }
// TODO(gvisor.dev/issue/5696): reimplement conntrack so that no need to
// maintain this flag in the packet. Currently conntrack needs this flag to
// tell if a noop connection should be inserted at Input hook. Once conntrack
@@ -322,15 +345,12 @@ func (pk *PacketBuffer) CloneToInbound() *PacketBuffer {
// headerInfo stores metadata about a header in a packet.
type headerInfo struct {
- // buf is the memorized slice for both prepended and consumed header.
- // When header is prepended, buf serves as memorized value, which is a slice
- // of pk.header. When header is consumed, buf is the slice pulled out from
- // pk.Data, which is the only place to hold this header.
- buf buffer.View
-
- // offset will be a negative number denoting the offset where this header is
- // from the end of pk.header, if it is prepended. Otherwise, zero.
+ // offset is the offset of the header in pk.buf relative to
+ // pk.buf[pk.reserved]. See the PacketBuffer struct for details.
offset int
+
+ // length is the length of this header.
+ length int
}
// PacketHeader is a handle object to a header in the underlying packet.
@@ -340,14 +360,14 @@ type PacketHeader struct {
}
// View returns the underlying storage of h.
-func (h PacketHeader) View() buffer.View {
- return h.pk.headers[h.typ].buf
+func (h PacketHeader) View() tcpipbuffer.View {
+ return h.pk.headerView(h.typ)
}
// Push pushes size bytes in the front of its residing packet, and returns the
// backing storage. Callers may only call one of Push or Consume once on each
// header in the lifetime of the underlying packet.
-func (h PacketHeader) Push(size int) buffer.View {
+func (h PacketHeader) Push(size int) tcpipbuffer.View {
return h.pk.push(h.typ, size)
}
@@ -356,7 +376,7 @@ func (h PacketHeader) Push(size int) buffer.View {
// size, consumed will be false, and the state of h will not be affected.
// Callers may only call one of Push or Consume once on each header in the
// lifetime of the underlying packet.
-func (h PacketHeader) Consume(size int) (v buffer.View, consumed bool) {
+func (h PacketHeader) Consume(size int) (v tcpipbuffer.View, consumed bool) {
return h.pk.consume(h.typ, size)
}
@@ -367,55 +387,84 @@ type PacketData struct {
// PullUp returns a contiguous view of size bytes from the beginning of d.
// Callers should not write to or keep the view for later use.
-func (d PacketData) PullUp(size int) (buffer.View, bool) {
- return d.pk.data.PullUp(size)
+func (d PacketData) PullUp(size int) (tcpipbuffer.View, bool) {
+ return d.pk.buf.PullUp(d.pk.dataOffset(), size)
}
// DeleteFront removes count from the beginning of d. It panics if count >
// d.Size(). All backing storage references after the front of d are
// invalidated.
func (d PacketData) DeleteFront(count int) {
- d.pk.data.TrimFront(count)
+ if !d.pk.buf.Remove(d.pk.dataOffset(), count) {
+ panic("count > d.Size()")
+ }
}
// CapLength reduces d to at most length bytes.
func (d PacketData) CapLength(length int) {
- d.pk.data.CapLength(length)
+ if length < 0 {
+ panic("length < 0")
+ }
+ if currLength := d.Size(); currLength > length {
+ trim := currLength - length
+ d.pk.buf.Remove(int(d.pk.buf.Size())-trim, trim)
+ }
}
// Views returns the underlying storage of d in a slice of Views. Caller should
// not modify the returned slice.
-func (d PacketData) Views() []buffer.View {
- return d.pk.data.Views()
+func (d PacketData) Views() []tcpipbuffer.View {
+ var views []tcpipbuffer.View
+ offset := d.pk.dataOffset()
+ d.pk.buf.SubApply(offset, int(d.pk.buf.Size())-offset, func(v []byte) {
+ views = append(views, v)
+ })
+ return views
}
// AppendView appends v into d, taking the ownership of v.
-func (d PacketData) AppendView(v buffer.View) {
- d.pk.data.AppendView(v)
+func (d PacketData) AppendView(v tcpipbuffer.View) {
+ d.pk.buf.AppendOwned(v)
}
-// ReadFromData moves at most count bytes from the beginning of srcData to the
-// end of d and returns the number of bytes moved.
-func (d PacketData) ReadFromData(srcData PacketData, count int) int {
- return srcData.pk.data.ReadToVV(&d.pk.data, count)
+// MergeFragment appends the data portion of frag to dst. It takes ownership of
+// frag and frag should not be used again.
+func MergeFragment(dst, frag *PacketBuffer) {
+ frag.buf.TrimFront(int64(frag.dataOffset()))
+ dst.buf.Merge(frag.buf)
}
// ReadFromVV moves at most count bytes from the beginning of srcVV to the end
// of d and returns the number of bytes moved.
-func (d PacketData) ReadFromVV(srcVV *buffer.VectorisedView, count int) int {
- return srcVV.ReadToVV(&d.pk.data, count)
+func (d PacketData) ReadFromVV(srcVV *tcpipbuffer.VectorisedView, count int) int {
+ done := 0
+ for _, v := range srcVV.Views() {
+ if len(v) < count {
+ count -= len(v)
+ done += len(v)
+ d.pk.buf.AppendOwned(v)
+ } else {
+ v = v[:count]
+ count -= len(v)
+ done += len(v)
+ d.pk.buf.Append(v)
+ break
+ }
+ }
+ srcVV.TrimFront(done)
+ return done
}
// Size returns the number of bytes in the data payload of the packet.
func (d PacketData) Size() int {
- return d.pk.data.Size()
+ return int(d.pk.buf.Size()) - d.pk.dataOffset()
}
// AsRange returns a Range representing the current data payload of the packet.
func (d PacketData) AsRange() Range {
return Range{
pk: d.pk,
- offset: d.pk.HeaderSize(),
+ offset: d.pk.dataOffset(),
length: d.Size(),
}
}
@@ -425,17 +474,12 @@ func (d PacketData) AsRange() Range {
//
// This method exists for compatibility between PacketBuffer and VectorisedView.
// It may be removed later and should be used with care.
-func (d PacketData) ExtractVV() buffer.VectorisedView {
- return d.pk.data
-}
-
-// Replace replaces the data portion of the packet with vv, taking the ownership
-// of vv.
-//
-// This method exists for compatibility between PacketBuffer and VectorisedView.
-// It may be removed later and should be used with care.
-func (d PacketData) Replace(vv buffer.VectorisedView) {
- d.pk.data = vv
+func (d PacketData) ExtractVV() tcpipbuffer.VectorisedView {
+ var vv tcpipbuffer.VectorisedView
+ d.pk.buf.SubApply(d.pk.dataOffset(), d.pk.Size(), func(v []byte) {
+ vv.AppendView(v)
+ })
+ return vv
}
// Range represents a contiguous subportion of a PacketBuffer.
@@ -479,9 +523,9 @@ func (r Range) Capped(max int) Range {
// AsView returns the backing storage of r if possible. It will allocate a new
// View if r spans multiple pieces internally. Caller should not write to the
// returned View in any way.
-func (r Range) AsView() buffer.View {
+func (r Range) AsView() tcpipbuffer.View {
var allocated bool
- var v buffer.View
+ var v tcpipbuffer.View
r.iterate(func(b []byte) {
if v == nil {
// v has not been assigned, allowing first view to be returned.
@@ -502,7 +546,7 @@ func (r Range) AsView() buffer.View {
}
// ToOwnedView returns an owned copy of data in r.
-func (r Range) ToOwnedView() buffer.View {
+func (r Range) ToOwnedView() tcpipbuffer.View {
if r.length == 0 {
return nil
}
@@ -523,63 +567,7 @@ func (r Range) Checksum() uint16 {
// iterate calls fn for each piece in r. fn is always called with a non-empty
// slice.
func (r Range) iterate(fn func([]byte)) {
- w := window{
- offset: r.offset,
- length: r.length,
- }
- // Header portion.
- for i := range r.pk.headers {
- if b := w.process(r.pk.headers[i].buf); len(b) > 0 {
- fn(b)
- }
- if w.isDone() {
- break
- }
- }
- // Data portion.
- if !w.isDone() {
- for _, v := range r.pk.data.Views() {
- if b := w.process(v); len(b) > 0 {
- fn(b)
- }
- if w.isDone() {
- break
- }
- }
- }
-}
-
-// window represents contiguous region of byte stream. User would call process()
-// to input bytes, and obtain a subslice that is inside the window.
-type window struct {
- offset int
- length int
-}
-
-// isDone returns true if the window has passed and further process() calls will
-// always return an empty slice. This can be used to end processing early.
-func (w *window) isDone() bool {
- return w.length == 0
-}
-
-// process feeds b in and returns a subslice that is inside the window. The
-// returned slice will be a subslice of b, and it does not keep b after method
-// returns. This method may return an empty slice if nothing in b is inside the
-// window.
-func (w *window) process(b []byte) (inWindow []byte) {
- if w.offset >= len(b) {
- w.offset -= len(b)
- return nil
- }
- if w.offset > 0 {
- b = b[w.offset:]
- w.offset = 0
- }
- if w.length < len(b) {
- b = b[:w.length]
- }
- w.length -= len(b)
- return b
+ r.pk.buf.SubApply(r.offset, r.length, fn)
}
// PayloadSince returns packet payload starting from and including a particular
@@ -587,21 +575,14 @@ func (w *window) process(b []byte) (inWindow []byte) {
//
// The returned View is owned by the caller - its backing buffer is separate
// from the packet header's underlying packet buffer.
-func PayloadSince(h PacketHeader) buffer.View {
- size := h.pk.data.Size()
- for _, hinfo := range h.pk.headers[h.typ:] {
- size += len(hinfo.buf)
+func PayloadSince(h PacketHeader) tcpipbuffer.View {
+ offset := h.pk.headerOffset()
+ for i := headerType(0); i < h.typ; i++ {
+ offset += h.pk.headers[i].length
}
-
- v := make(buffer.View, 0, size)
-
- for _, hinfo := range h.pk.headers[h.typ:] {
- v = append(v, hinfo.buf...)
- }
-
- for _, view := range h.pk.data.Views() {
- v = append(v, view...)
- }
-
- return v
+ return Range{
+ pk: h.pk,
+ offset: offset,
+ length: int(h.pk.buf.Size()) - offset,
+ }.ToOwnedView()
}
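
To make the reserved/pushed/consumed arithmetic in the new PacketBuffer
concrete, here is a toy walk-through with hypothetical sizes (a sketch of the
bookkeeping described in the struct comment, not gVisor code):

package main

import "fmt"

func main() {
	// Outgoing packet: 40 bytes of headroom reserved via ReserveHeaderBytes.
	reserved, pushed := 40, 0
	push := func(size int) int {
		pushed += size
		return reserved + (-pushed) // headerOffsetOf: reserved + stored offset
	}
	fmt.Println(push(20)) // 20: transport header occupies buf[20:40]
	fmt.Println(push(20)) // 0:  network header occupies buf[0:20]

	// Incoming packet: typically no headroom, so reserved is 0.
	reservedIn, consumed := 0, 0
	consume := func(size int) int {
		offset := consumed
		consumed += size
		return reservedIn + offset
	}
	fmt.Println(consume(20)) // 0:  network header occupies buf[0:20]
	fmt.Println(consume(20)) // 20: transport header occupies buf[20:40]
}
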