author    gVisor bot <gvisor-bot@google.com>    2021-05-13 21:00:53 +0000
committer gVisor bot <gvisor-bot@google.com>    2021-05-13 21:00:53 +0000
commit    e1cfd3185c285f4dc69804210cc0d77ec582beb5 (patch)
tree      866c789fcc0f67ec0933e3cf88fed6628e85b6f6
parent    f4d9f967005fdf7995439f56839cbb4a7589ff6c (diff)
parent    84f04cc858644e9748a82f33b834a84c8b0fc934 (diff)
Merge release-20210510.0-27-g84f04cc85 (automated)
-rw-r--r--  pkg/buffer/buffer.go                                      105
-rw-r--r--  pkg/buffer/buffer_list.go                                 221
-rw-r--r--  pkg/buffer/buffer_state_autogen.go                        163
-rw-r--r--  pkg/buffer/buffer_unsafe_state_autogen.go                   3
-rw-r--r--  pkg/buffer/pool.go                                         90
-rw-r--r--  pkg/buffer/safemem.go                                     133
-rw-r--r--  pkg/buffer/view.go                                        566
-rw-r--r--  pkg/buffer/view_unsafe.go                                  25
-rw-r--r--  pkg/tcpip/header/parse/parse.go                             6
-rw-r--r--  pkg/tcpip/network/internal/fragmentation/reassembler.go    3
-rw-r--r--  pkg/tcpip/network/ipv6/ipv6.go                             10
-rw-r--r--  pkg/tcpip/stack/packet_buffer.go                          367
12 files changed, 1494 insertions, 198 deletions
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
new file mode 100644
index 000000000..5b77a6a3f
--- /dev/null
+++ b/pkg/buffer/buffer.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package buffer provides the implementation of a buffer view.
+//
+// A view is a flexible buffer, supporting the safecopy operations natively as
+// well as the ability to grow via either prepend or append, and to shrink.
+package buffer
+
+// buffer encapsulates a queueable byte buffer.
+//
+// +stateify savable
+type buffer struct {
+ data []byte
+ read int
+ write int
+ bufferEntry
+}
+
+// init performs in-place initialization for zero value.
+func (b *buffer) init(size int) {
+ b.data = make([]byte, size)
+}
+
+// initWithData initializes b with data, taking ownership.
+func (b *buffer) initWithData(data []byte) {
+ b.data = data
+ b.read = 0
+ b.write = len(data)
+}
+
+// Reset resets read and write locations, effectively emptying the buffer.
+func (b *buffer) Reset() {
+ b.read = 0
+ b.write = 0
+}
+
+// Remove removes r from the unread portion. It returns false if r does not
+// fully reside in b.
+func (b *buffer) Remove(r Range) bool {
+ sz := b.ReadSize()
+ switch {
+ case r.Len() != r.Intersect(Range{end: sz}).Len():
+ return false
+ case r.Len() == 0:
+ // Noop
+ case r.begin == 0:
+ b.read += r.end
+ case r.end == sz:
+ b.write -= r.Len()
+ default:
+ // Remove from the middle of b.data.
+ copy(b.data[b.read+r.begin:], b.data[b.read+r.end:b.write])
+ b.write -= r.Len()
+ }
+ return true
+}
+
+// Full indicates the buffer is full.
+//
+// This indicates there is no capacity left to write.
+func (b *buffer) Full() bool {
+ return b.write == len(b.data)
+}
+
+// ReadSize returns the number of bytes available for reading.
+func (b *buffer) ReadSize() int {
+ return b.write - b.read
+}
+
+// ReadMove advances the read index by the given amount.
+func (b *buffer) ReadMove(n int) {
+ b.read += n
+}
+
+// ReadSlice returns the read slice for this buffer.
+func (b *buffer) ReadSlice() []byte {
+ return b.data[b.read:b.write]
+}
+
+// WriteSize returns the number of bytes available for writing.
+func (b *buffer) WriteSize() int {
+ return len(b.data) - b.write
+}
+
+// WriteMove advances the write index by the given amount.
+func (b *buffer) WriteMove(n int) {
+ b.write += n
+}
+
+// WriteSlice returns the write slice for this buffer.
+func (b *buffer) WriteSlice() []byte {
+ return b.data[b.write:]
+}
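
For illustration, here is a minimal standalone sketch of the read/write index
bookkeeping used by the buffer type above. The toyBuffer type and its names are
hypothetical; data[read:write] is the readable region and data[write:] is the
writable region, exactly as in ReadSlice and WriteSlice.

package main

import "fmt"

// toyBuffer mirrors the unexported buffer type's index scheme.
type toyBuffer struct {
	data  []byte
	read  int
	write int
}

func (b *toyBuffer) readSlice() []byte  { return b.data[b.read:b.write] }
func (b *toyBuffer) writeSlice() []byte { return b.data[b.write:] }

func main() {
	b := &toyBuffer{data: make([]byte, 8)}
	n := copy(b.writeSlice(), "abc")
	b.write += n                      // WriteMove: commit the written bytes
	fmt.Printf("%q\n", b.readSlice()) // "abc"
	b.read++                          // ReadMove: consume one byte
	fmt.Printf("%q\n", b.readSlice()) // "bc"
}
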
diff --git a/pkg/buffer/buffer_list.go b/pkg/buffer/buffer_list.go
new file mode 100644
index 000000000..6b5bea3fc
--- /dev/null
+++ b/pkg/buffer/buffer_list.go
@@ -0,0 +1,221 @@
+package buffer
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type bufferElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (bufferElementMapper) linkerFor(elem *buffer) *buffer { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type bufferList struct {
+ head *buffer
+ tail *buffer
+}
+
+// Reset resets list l to the empty state.
+func (l *bufferList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+//
+//go:nosplit
+func (l *bufferList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+//
+//go:nosplit
+func (l *bufferList) Front() *buffer {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+//
+//go:nosplit
+func (l *bufferList) Back() *buffer {
+ return l.tail
+}
+
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+//
+//go:nosplit
+func (l *bufferList) Len() (count int) {
+ for e := l.Front(); e != nil; e = (bufferElementMapper{}.linkerFor(e)).Next() {
+ count++
+ }
+ return count
+}
+
+// PushFront inserts the element e at the front of list l.
+//
+//go:nosplit
+func (l *bufferList) PushFront(e *buffer) {
+ linker := bufferElementMapper{}.linkerFor(e)
+ linker.SetNext(l.head)
+ linker.SetPrev(nil)
+ if l.head != nil {
+ bufferElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+//
+//go:nosplit
+func (l *bufferList) PushBack(e *buffer) {
+ linker := bufferElementMapper{}.linkerFor(e)
+ linker.SetNext(nil)
+ linker.SetPrev(l.tail)
+ if l.tail != nil {
+ bufferElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+//
+//go:nosplit
+func (l *bufferList) PushBackList(m *bufferList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ bufferElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ bufferElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+//
+//go:nosplit
+func (l *bufferList) InsertAfter(b, e *buffer) {
+ bLinker := bufferElementMapper{}.linkerFor(b)
+ eLinker := bufferElementMapper{}.linkerFor(e)
+
+ a := bLinker.Next()
+
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ bLinker.SetNext(e)
+
+ if a != nil {
+ bufferElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+//
+//go:nosplit
+func (l *bufferList) InsertBefore(a, e *buffer) {
+ aLinker := bufferElementMapper{}.linkerFor(a)
+ eLinker := bufferElementMapper{}.linkerFor(e)
+
+ b := aLinker.Prev()
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ aLinker.SetPrev(e)
+
+ if b != nil {
+ bufferElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+//
+//go:nosplit
+func (l *bufferList) Remove(e *buffer) {
+ linker := bufferElementMapper{}.linkerFor(e)
+ prev := linker.Prev()
+ next := linker.Next()
+
+ if prev != nil {
+ bufferElementMapper{}.linkerFor(prev).SetNext(next)
+ } else if l.head == e {
+ l.head = next
+ }
+
+ if next != nil {
+ bufferElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else if l.tail == e {
+ l.tail = prev
+ }
+
+ linker.SetNext(nil)
+ linker.SetPrev(nil)
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type bufferEntry struct {
+ next *buffer
+ prev *buffer
+}
+
+// Next returns the entry that follows e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) Next() *buffer {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) Prev() *buffer {
+ return e.prev
+}
+
+// SetNext assigns 'elem' as the entry that follows e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) SetNext(elem *buffer) {
+ e.next = elem
+}
+
+// SetPrev assigns 'elem' as the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *bufferEntry) SetPrev(elem *buffer) {
+ e.prev = elem
+}
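
As a rough sketch (illustration only), a hypothetical test in package buffer
could exercise this intrusive list as follows; bufferList, pool, and buffer are
the unexported types added by this commit:

package buffer

import "testing"

func TestBufferListSketch(t *testing.T) {
	var l bufferList
	var p pool

	// Queue two buffers and walk them front to back. Each buffer embeds a
	// bufferEntry, so linking requires no per-node allocation.
	l.PushBack(p.get())
	l.PushBack(p.get())
	n := 0
	for e := l.Front(); e != nil; e = e.Next() {
		n++
	}
	if n != 2 {
		t.Fatalf("got %d elements, want 2", n)
	}

	// Unlink the head and recycle it through the pool.
	head := l.Front()
	l.Remove(head)
	p.put(head)
}
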
diff --git a/pkg/buffer/buffer_state_autogen.go b/pkg/buffer/buffer_state_autogen.go
new file mode 100644
index 000000000..aaa72b723
--- /dev/null
+++ b/pkg/buffer/buffer_state_autogen.go
@@ -0,0 +1,163 @@
+// automatically generated by stateify.
+
+package buffer
+
+import (
+ "gvisor.dev/gvisor/pkg/state"
+)
+
+func (b *buffer) StateTypeName() string {
+ return "pkg/buffer.buffer"
+}
+
+func (b *buffer) StateFields() []string {
+ return []string{
+ "data",
+ "read",
+ "write",
+ "bufferEntry",
+ }
+}
+
+func (b *buffer) beforeSave() {}
+
+// +checklocksignore
+func (b *buffer) StateSave(stateSinkObject state.Sink) {
+ b.beforeSave()
+ stateSinkObject.Save(0, &b.data)
+ stateSinkObject.Save(1, &b.read)
+ stateSinkObject.Save(2, &b.write)
+ stateSinkObject.Save(3, &b.bufferEntry)
+}
+
+func (b *buffer) afterLoad() {}
+
+// +checklocksignore
+func (b *buffer) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &b.data)
+ stateSourceObject.Load(1, &b.read)
+ stateSourceObject.Load(2, &b.write)
+ stateSourceObject.Load(3, &b.bufferEntry)
+}
+
+func (l *bufferList) StateTypeName() string {
+ return "pkg/buffer.bufferList"
+}
+
+func (l *bufferList) StateFields() []string {
+ return []string{
+ "head",
+ "tail",
+ }
+}
+
+func (l *bufferList) beforeSave() {}
+
+// +checklocksignore
+func (l *bufferList) StateSave(stateSinkObject state.Sink) {
+ l.beforeSave()
+ stateSinkObject.Save(0, &l.head)
+ stateSinkObject.Save(1, &l.tail)
+}
+
+func (l *bufferList) afterLoad() {}
+
+// +checklocksignore
+func (l *bufferList) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &l.head)
+ stateSourceObject.Load(1, &l.tail)
+}
+
+func (e *bufferEntry) StateTypeName() string {
+ return "pkg/buffer.bufferEntry"
+}
+
+func (e *bufferEntry) StateFields() []string {
+ return []string{
+ "next",
+ "prev",
+ }
+}
+
+func (e *bufferEntry) beforeSave() {}
+
+// +checklocksignore
+func (e *bufferEntry) StateSave(stateSinkObject state.Sink) {
+ e.beforeSave()
+ stateSinkObject.Save(0, &e.next)
+ stateSinkObject.Save(1, &e.prev)
+}
+
+func (e *bufferEntry) afterLoad() {}
+
+// +checklocksignore
+func (e *bufferEntry) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &e.next)
+ stateSourceObject.Load(1, &e.prev)
+}
+
+func (p *pool) StateTypeName() string {
+ return "pkg/buffer.pool"
+}
+
+func (p *pool) StateFields() []string {
+ return []string{
+ "bufferSize",
+ "embeddedStorage",
+ }
+}
+
+func (p *pool) beforeSave() {}
+
+// +checklocksignore
+func (p *pool) StateSave(stateSinkObject state.Sink) {
+ p.beforeSave()
+ stateSinkObject.Save(0, &p.bufferSize)
+ stateSinkObject.Save(1, &p.embeddedStorage)
+}
+
+// +checklocksignore
+func (p *pool) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &p.bufferSize)
+ stateSourceObject.LoadWait(1, &p.embeddedStorage)
+ stateSourceObject.AfterLoad(p.afterLoad)
+}
+
+func (v *View) StateTypeName() string {
+ return "pkg/buffer.View"
+}
+
+func (v *View) StateFields() []string {
+ return []string{
+ "data",
+ "size",
+ "pool",
+ }
+}
+
+func (v *View) beforeSave() {}
+
+// +checklocksignore
+func (v *View) StateSave(stateSinkObject state.Sink) {
+ v.beforeSave()
+ stateSinkObject.Save(0, &v.data)
+ stateSinkObject.Save(1, &v.size)
+ stateSinkObject.Save(2, &v.pool)
+}
+
+func (v *View) afterLoad() {}
+
+// +checklocksignore
+func (v *View) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &v.data)
+ stateSourceObject.Load(1, &v.size)
+ stateSourceObject.Load(2, &v.pool)
+}
+
+func init() {
+ state.Register((*buffer)(nil))
+ state.Register((*bufferList)(nil))
+ state.Register((*bufferEntry)(nil))
+ state.Register((*pool)(nil))
+ state.Register((*View)(nil))
+}
diff --git a/pkg/buffer/buffer_unsafe_state_autogen.go b/pkg/buffer/buffer_unsafe_state_autogen.go
new file mode 100644
index 000000000..5a5c40722
--- /dev/null
+++ b/pkg/buffer/buffer_unsafe_state_autogen.go
@@ -0,0 +1,3 @@
+// automatically generated by stateify.
+
+package buffer
diff --git a/pkg/buffer/pool.go b/pkg/buffer/pool.go
new file mode 100644
index 000000000..2ec41dd4f
--- /dev/null
+++ b/pkg/buffer/pool.go
@@ -0,0 +1,90 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+const (
+ // embeddedCount is the number of buffer structures embedded in the pool. It
+ // is also the number of buffers allocated at a time when the pool overflows.
+ embeddedCount = 8
+
+ // defaultBufferSize is the default size of each underlying storage buffer.
+ //
+ // It is slightly less than two pages. This is done intentionally to ensure
+ // that the buffer object aligns with runtime internals. This two-page size
+ // effectively minimizes internal fragmentation while still providing chunks
+ // large enough to limit excessive segmentation.
+ defaultBufferSize = 8144
+)
+
+// pool allocates buffers.
+//
+// It contains embedded buffer storage for the fast path when the number of
+// buffers needed is small.
+//
+// +stateify savable
+type pool struct {
+ bufferSize int
+ avail []buffer `state:"nosave"`
+ embeddedStorage [embeddedCount]buffer `state:"wait"`
+}
+
+// get gets a new buffer from p.
+func (p *pool) get() *buffer {
+ buf := p.getNoInit()
+ buf.init(p.bufferSize)
+ return buf
+}
+
+// getNoInit gets a new buffer from p without initializing it.
+func (p *pool) getNoInit() *buffer {
+ if p.avail == nil {
+ p.avail = p.embeddedStorage[:]
+ }
+ if len(p.avail) == 0 {
+ p.avail = make([]buffer, embeddedCount)
+ }
+ if p.bufferSize <= 0 {
+ p.bufferSize = defaultBufferSize
+ }
+ buf := &p.avail[0]
+ p.avail = p.avail[1:]
+ return buf
+}
+
+// put releases buf.
+func (p *pool) put(buf *buffer) {
+ // Remove reference to the underlying storage, allowing it to be garbage
+ // collected.
+ buf.data = nil
+ buf.Reset()
+}
+
+// setBufferSize sets the size of underlying storage buffer for future
+// allocations. It can be called at any time.
+func (p *pool) setBufferSize(size int) {
+ p.bufferSize = size
+}
+
+// afterLoad is invoked by stateify.
+func (p *pool) afterLoad() {
+ // S/R does not correctly save a subslice of embeddedStorage. Restore the
+ // available portion of embeddedStorage manually; leave avail as nil if none
+ // of the embedded buffers were in use.
+ for i := len(p.embeddedStorage); i > 0; i-- {
+ if p.embeddedStorage[i-1].data != nil {
+ p.avail = p.embeddedStorage[i:]
+ break
+ }
+ }
+}
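
A short sketch of the pool's fast path (hypothetical test code in package
buffer, for illustration): the first embeddedCount allocations come from the
embedded array, and later ones from freshly allocated slabs.

package buffer

import "testing"

func TestPoolSketch(t *testing.T) {
	var p pool // zero value is ready; bufferSize defaults to defaultBufferSize

	var bufs []*buffer
	for i := 0; i < embeddedCount+1; i++ {
		bufs = append(bufs, p.get()) // the 9th get spills past embeddedStorage
	}
	if got := bufs[0].WriteSize(); got != defaultBufferSize {
		t.Fatalf("WriteSize = %d, want %d", got, defaultBufferSize)
	}
	for _, b := range bufs {
		p.put(b) // drops the data reference; the struct itself is not reused
	}
}
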
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
new file mode 100644
index 000000000..8b42575b4
--- /dev/null
+++ b/pkg/buffer/safemem.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+ "gvisor.dev/gvisor/pkg/safemem"
+)
+
+// WriteBlock returns this buffer as a write Block.
+func (b *buffer) WriteBlock() safemem.Block {
+ return safemem.BlockFromSafeSlice(b.WriteSlice())
+}
+
+// ReadBlock returns this buffer as a read Block.
+func (b *buffer) ReadBlock() safemem.Block {
+ return safemem.BlockFromSafeSlice(b.ReadSlice())
+}
+
+// WriteFromSafememReader writes up to count bytes from r to v and advances the
+// write index by the number of bytes written. It calls r.ReadToBlocks() at
+// most once.
+func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) {
+ if count == 0 {
+ return 0, nil
+ }
+
+ var (
+ dst safemem.BlockSeq
+ blocks []safemem.Block
+ )
+
+ // Need at least one buffer.
+ firstBuf := v.data.Back()
+ if firstBuf == nil {
+ firstBuf = v.pool.get()
+ v.data.PushBack(firstBuf)
+ }
+
+ // Does the last block have sufficient capacity alone?
+ if l := uint64(firstBuf.WriteSize()); l >= count {
+ dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count))
+ } else {
+ // Append blocks until sufficient.
+ count -= l
+ blocks = append(blocks, firstBuf.WriteBlock())
+ for count > 0 {
+ emptyBuf := v.pool.get()
+ v.data.PushBack(emptyBuf)
+ block := emptyBuf.WriteBlock().TakeFirst64(count)
+ count -= uint64(block.Len())
+ blocks = append(blocks, block)
+ }
+ dst = safemem.BlockSeqFromSlice(blocks)
+ }
+
+ // Perform I/O.
+ n, err := r.ReadToBlocks(dst)
+ v.size += int64(n)
+
+ // Update all indices.
+ for left := n; left > 0; firstBuf = firstBuf.Next() {
+ if l := firstBuf.WriteSize(); left >= uint64(l) {
+ firstBuf.WriteMove(l) // Whole block.
+ left -= uint64(l)
+ } else {
+ firstBuf.WriteMove(int(left)) // Partial block.
+ left = 0
+ }
+ }
+
+ return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the
+// write index by the number of bytes written.
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes())
+}
+
+// ReadToSafememWriter reads up to count bytes from v to w. It does not advance
+// the read index. It calls w.WriteFromBlocks() at most once.
+func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) {
+ if count == 0 {
+ return 0, nil
+ }
+
+ var (
+ src safemem.BlockSeq
+ blocks []safemem.Block
+ )
+
+ firstBuf := v.data.Front()
+ if firstBuf == nil {
+ return 0, nil // No EOF.
+ }
+
+ // Is all the data in a single block?
+ if l := uint64(firstBuf.ReadSize()); l >= count {
+ src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count))
+ } else {
+ // Build a list of all the buffers.
+ count -= l
+ blocks = append(blocks, firstBuf.ReadBlock())
+ for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() {
+ block := buf.ReadBlock().TakeFirst64(count)
+ count -= uint64(block.Len())
+ blocks = append(blocks, block)
+ }
+ src = safemem.BlockSeqFromSlice(blocks)
+ }
+
+ // Perform I/O. As documented, we don't advance the read index.
+ return w.WriteFromBlocks(src)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the
+// read index by the number of bytes read, so it is only safe to call if the
+// caller guarantees that ReadToBlocks will be called at most once.
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes())
+}
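
A minimal sketch of driving these safemem hooks from outside the package,
assuming the gvisor.dev/gvisor/pkg/buffer and gvisor.dev/gvisor/pkg/safemem
import paths used in this commit (illustration only):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/buffer"
	"gvisor.dev/gvisor/pkg/safemem"
)

func main() {
	var v buffer.View

	// Write through the safemem.Writer interface; the write index advances.
	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice([]byte("hello world")))
	n, err := v.WriteFromBlocks(src)
	fmt.Println(n, err, v.Size()) // 11 <nil> 11

	// Read back through the safemem.Reader interface; note that the read index
	// is NOT advanced, so this can only safely be done once.
	out := make([]byte, v.Size())
	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(out))
	m, _ := v.ReadToBlocks(dst)
	fmt.Printf("%q\n", out[:m]) // "hello world"
}
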
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
new file mode 100644
index 000000000..7bcfcd543
--- /dev/null
+++ b/pkg/buffer/view.go
@@ -0,0 +1,566 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+ "fmt"
+ "io"
+)
+
+// Buffer is an alias for View.
+type Buffer = View
+
+// View is a non-linear buffer.
+//
+// All methods are thread compatible.
+//
+// +stateify savable
+type View struct {
+ data bufferList
+ size int64
+ pool pool
+}
+
+// TrimFront removes the first count bytes from the buffer.
+func (v *View) TrimFront(count int64) {
+ if count >= v.size {
+ v.advanceRead(v.size)
+ } else {
+ v.advanceRead(count)
+ }
+}
+
+// Remove deletes data at the specified location in v. It returns false if the
+// specified range does not fully reside in v.
+func (v *View) Remove(offset, length int) bool {
+ if offset < 0 || length < 0 {
+ return false
+ }
+ tgt := Range{begin: offset, end: offset + length}
+ if tgt.Len() != tgt.Intersect(Range{end: int(v.size)}).Len() {
+ return false
+ }
+
+ // Scan through each buffer and remove intersections.
+ var curr Range
+ for buf := v.data.Front(); buf != nil; {
+ origLen := buf.ReadSize()
+ curr.end = curr.begin + origLen
+
+ if x := curr.Intersect(tgt); x.Len() > 0 {
+ if !buf.Remove(x.Offset(-curr.begin)) {
+ panic("buf.Remove() failed")
+ }
+ if buf.ReadSize() == 0 {
+ // buf fully removed, removing it from the list.
+ oldBuf := buf
+ buf = buf.Next()
+ v.data.Remove(oldBuf)
+ v.pool.put(oldBuf)
+ } else {
+ // Only partial data intersects, moving on to next one.
+ buf = buf.Next()
+ }
+ v.size -= int64(x.Len())
+ } else {
+ // This buffer is not in range, moving on to next one.
+ buf = buf.Next()
+ }
+
+ curr.begin += origLen
+ if curr.begin >= tgt.end {
+ break
+ }
+ }
+ return true
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (v *View) ReadAt(p []byte, offset int64) (int, error) {
+ var (
+ skipped int64
+ done int64
+ )
+ for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() {
+ needToSkip := int(offset - skipped)
+ if sz := buf.ReadSize(); sz <= needToSkip {
+ skipped += int64(sz)
+ continue
+ }
+
+ // Actually read data.
+ n := copy(p[done:], buf.ReadSlice()[needToSkip:])
+ skipped += int64(needToSkip)
+ done += int64(n)
+ }
+ if int(done) < len(p) || offset+done == v.size {
+ return int(done), io.EOF
+ }
+ return int(done), nil
+}
+
+// advanceRead advances the view's read index.
+//
+// Precondition: there must be sufficient bytes in the buffer.
+func (v *View) advanceRead(count int64) {
+ for buf := v.data.Front(); buf != nil && count > 0; {
+ sz := int64(buf.ReadSize())
+ if sz > count {
+ // There is still data for reading.
+ buf.ReadMove(int(count))
+ v.size -= count
+ count = 0
+ break
+ }
+
+ // Consume the whole buffer.
+ oldBuf := buf
+ buf = buf.Next() // Iterate.
+ v.data.Remove(oldBuf)
+ v.pool.put(oldBuf)
+
+ // Update counts.
+ count -= sz
+ v.size -= sz
+ }
+ if count > 0 {
+ panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count))
+ }
+}
+
+// Truncate truncates the view to the given bytes.
+//
+// This will not grow the view, only shrink it. If a length is passed that is
+// greater than the current size of the view, then nothing will happen.
+//
+// Precondition: length must be >= 0.
+func (v *View) Truncate(length int64) {
+ if length < 0 {
+ panic("negative length provided")
+ }
+ if length >= v.size {
+ return // Nothing to do.
+ }
+ for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() {
+ sz := int64(buf.ReadSize())
+ if after := v.size - sz; after < length {
+ // Truncate the buffer locally.
+ left := (length - after)
+ buf.write = buf.read + int(left)
+ v.size = length
+ break
+ }
+
+ // Drop the buffer completely; see above.
+ v.data.Remove(buf)
+ v.pool.put(buf)
+ v.size -= sz
+ }
+}
+
+// Grow grows the view so that it holds at least length bytes, appending as
+// needed. If zero is true, the appended bytes will be zeroed. If zero is
+// false, zeroing them is the caller's responsibility.
+//
+// Precondition: length must be >= 0.
+func (v *View) Grow(length int64, zero bool) {
+ if length < 0 {
+ panic("negative length provided")
+ }
+ for v.size < length {
+ buf := v.data.Back()
+
+ // Is there some space in the last buffer?
+ if buf == nil || buf.Full() {
+ buf = v.pool.get()
+ v.data.PushBack(buf)
+ }
+
+ // Write up to length bytes.
+ sz := buf.WriteSize()
+ if int64(sz) > length-v.size {
+ sz = int(length - v.size)
+ }
+
+ // Zero the written section; note that this pattern is
+ // specifically recognized and optimized by the compiler.
+ if zero {
+ for i := buf.write; i < buf.write+sz; i++ {
+ buf.data[i] = 0
+ }
+ }
+
+ // Advance the index.
+ buf.WriteMove(sz)
+ v.size += int64(sz)
+ }
+}
+
+// Prepend prepends the given data.
+func (v *View) Prepend(data []byte) {
+ // Is there any space in the first buffer?
+ if buf := v.data.Front(); buf != nil && buf.read > 0 {
+ // Fill up before the first write.
+ avail := buf.read
+ bStart := 0
+ dStart := len(data) - avail
+ if avail > len(data) {
+ bStart = avail - len(data)
+ dStart = 0
+ }
+ n := copy(buf.data[bStart:], data[dStart:])
+ data = data[:dStart]
+ v.size += int64(n)
+ buf.read -= n
+ }
+
+ for len(data) > 0 {
+ // Do we need an empty buffer?
+ buf := v.pool.get()
+ v.data.PushFront(buf)
+
+ // The buffer is empty; copy last chunk.
+ avail := len(buf.data)
+ bStart := 0
+ dStart := len(data) - avail
+ if avail > len(data) {
+ bStart = avail - len(data)
+ dStart = 0
+ }
+
+ // We have to put the data at the end of the current
+ // buffer in order to ensure that the next prepend will
+ // correctly fill up the beginning of this buffer.
+ n := copy(buf.data[bStart:], data[dStart:])
+ data = data[:dStart]
+ v.size += int64(n)
+ buf.read = len(buf.data) - n
+ buf.write = len(buf.data)
+ }
+}
+
+// Append appends the given data.
+func (v *View) Append(data []byte) {
+ for done := 0; done < len(data); {
+ buf := v.data.Back()
+
+ // Ensure there's a buffer with space.
+ if buf == nil || buf.Full() {
+ buf = v.pool.get()
+ v.data.PushBack(buf)
+ }
+
+ // Copy in to the given buffer.
+ n := copy(buf.WriteSlice(), data[done:])
+ done += n
+ buf.WriteMove(n)
+ v.size += int64(n)
+ }
+}
+
+// AppendOwned takes ownership of data and appends it to v.
+func (v *View) AppendOwned(data []byte) {
+ if len(data) > 0 {
+ buf := v.pool.getNoInit()
+ buf.initWithData(data)
+ v.data.PushBack(buf)
+ v.size += int64(len(data))
+ }
+}
+
+// PullUp makes the specified range contiguous and returns the backing memory.
+func (v *View) PullUp(offset, length int) ([]byte, bool) {
+ if length == 0 {
+ return nil, true
+ }
+ tgt := Range{begin: offset, end: offset + length}
+ if tgt.Intersect(Range{end: int(v.size)}).Len() != length {
+ return nil, false
+ }
+
+ curr := Range{}
+ buf := v.data.Front()
+ for ; buf != nil; buf = buf.Next() {
+ origLen := buf.ReadSize()
+ curr.end = curr.begin + origLen
+
+ if x := curr.Intersect(tgt); x.Len() == tgt.Len() {
+ // buf covers the whole requested target range.
+ sub := x.Offset(-curr.begin)
+ return buf.ReadSlice()[sub.begin:sub.end], true
+ } else if x.Len() > 0 {
+ // buf is pointing at the starting buffer we want to merge.
+ break
+ }
+
+ curr.begin += origLen
+ }
+
+ // Calculate the total merged length.
+ totLen := 0
+ for n := buf; n != nil; n = n.Next() {
+ totLen += n.ReadSize()
+ if curr.begin+totLen >= tgt.end {
+ break
+ }
+ }
+
+ // Merge the buffers.
+ data := make([]byte, totLen)
+ off := 0
+ for n := buf; n != nil && off < totLen; {
+ copy(data[off:], n.ReadSlice())
+ off += n.ReadSize()
+
+ // Remove buffers except for the first one, which will be reused.
+ if n == buf {
+ n = n.Next()
+ } else {
+ old := n
+ n = n.Next()
+ v.data.Remove(old)
+ v.pool.put(old)
+ }
+ }
+
+ // Update the first buffer with merged data.
+ buf.initWithData(data)
+
+ r := tgt.Offset(-curr.begin)
+ return buf.data[r.begin:r.end], true
+}
+
+// Flatten returns a flattened copy of this data.
+//
+// This method should not be used in any performance-sensitive paths. It may
+// allocate a fresh byte slice sufficiently large to contain all the data in
+// the buffer. This is principally for debugging.
+//
+// N.B. The data may still belong to this view: if there is a single buffer
+// present, it will be returned directly. The result should be used for
+// temporary purposes only, and a reference to the returned slice should not
+// be held.
+func (v *View) Flatten() []byte {
+ if buf := v.data.Front(); buf == nil {
+ return nil // No data at all.
+ } else if buf.Next() == nil {
+ return buf.ReadSlice() // Only one buffer.
+ }
+ data := make([]byte, 0, v.size) // Need to flatten.
+ for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+ // Copy to the allocated slice.
+ data = append(data, buf.ReadSlice()...)
+ }
+ return data
+}
+
+// Size indicates the total amount of data available in this view.
+func (v *View) Size() int64 {
+ return v.size
+}
+
+// Copy makes a strict copy of this view.
+func (v *View) Copy() (other View) {
+ for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+ other.Append(buf.ReadSlice())
+ }
+ return
+}
+
+// Apply applies the given function across all valid data.
+func (v *View) Apply(fn func([]byte)) {
+ for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+ fn(buf.ReadSlice())
+ }
+}
+
+// SubApply applies fn to a given range of data in v. Any part of the range
+// outside of v is ignored.
+func (v *View) SubApply(offset, length int, fn func([]byte)) {
+ for buf := v.data.Front(); length > 0 && buf != nil; buf = buf.Next() {
+ d := buf.ReadSlice()
+ if offset >= len(d) {
+ offset -= len(d)
+ continue
+ }
+ if offset > 0 {
+ d = d[offset:]
+ offset = 0
+ }
+ if length < len(d) {
+ d = d[:length]
+ }
+ fn(d)
+ length -= len(d)
+ }
+}
+
+// Merge merges the provided View with this one.
+//
+// The other view will be appended to v, and other will be empty after this
+// operation completes.
+func (v *View) Merge(other *View) {
+ // Copy over all buffers.
+ for buf := other.data.Front(); buf != nil; buf = other.data.Front() {
+ other.data.Remove(buf)
+ v.data.PushBack(buf)
+ }
+
+ // Adjust sizes.
+ v.size += other.size
+ other.size = 0
+}
+
+// WriteFromReader writes to the buffer from an io.Reader.
+//
+// A minimum read size equal to unsafe.Sizeof(uintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
+func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
+ var (
+ done int64
+ n int
+ err error
+ )
+ for done < count {
+ buf := v.data.Back()
+
+ // Ensure we have an empty buffer.
+ if buf == nil || buf.Full() {
+ buf = v.pool.get()
+ v.data.PushBack(buf)
+ }
+
+ // Is this less than the minimum batch?
+ if buf.WriteSize() < minBatch && (count-done) >= int64(minBatch) {
+ tmp := make([]byte, minBatch)
+ n, err = r.Read(tmp)
+ v.Append(tmp[:n])
+ done += int64(n)
+ if err != nil {
+ break
+ }
+ continue
+ }
+
+ // Limit the read, if necessary.
+ sz := buf.WriteSize()
+ if left := count - done; int64(sz) > left {
+ sz = int(left)
+ }
+
+ // Pass the relevant portion of the buffer.
+ n, err = r.Read(buf.WriteSlice()[:sz])
+ buf.WriteMove(n)
+ done += int64(n)
+ v.size += int64(n)
+ if err == io.EOF {
+ err = nil // Short write allowed.
+ break
+ } else if err != nil {
+ break
+ }
+ }
+ return done, err
+}
+
+// ReadToWriter reads from the buffer into an io.Writer.
+//
+// N.B. This does not consume the bytes read. TrimFront should
+// be called appropriately after this call in order to do so.
+//
+// A minimum write size equal to unsafe.Sizeof(uintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
+func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
+ var (
+ done int64
+ n int
+ err error
+ )
+ offset := 0 // Spill-over for batching.
+ for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() {
+ // Has this been consumed? Skip it.
+ sz := buf.ReadSize()
+ if sz <= offset {
+ offset -= sz
+ continue
+ }
+ sz -= offset
+
+ // Is this less than the minimum batch?
+ left := count - done
+ if sz < minBatch && left >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
+ tmp := make([]byte, minBatch)
+ n, err = v.ReadAt(tmp, done)
+ w.Write(tmp[:n])
+ done += int64(n)
+ offset = n - sz // Reset below.
+ if err != nil {
+ break
+ }
+ continue
+ }
+
+ // Limit the write if necessary.
+ if int64(sz) >= left {
+ sz = int(left)
+ }
+
+ // Perform the actual write.
+ n, err = w.Write(buf.ReadSlice()[offset : offset+sz])
+ done += int64(n)
+ if err != nil {
+ break
+ }
+
+ // Reset spill-over.
+ offset = 0
+ }
+ return done, err
+}
+
+// A Range specifies a range of the buffer.
+type Range struct {
+ begin int
+ end int
+}
+
+// Intersect returns the intersection of x and y.
+func (x Range) Intersect(y Range) Range {
+ if x.begin < y.begin {
+ x.begin = y.begin
+ }
+ if x.end > y.end {
+ x.end = y.end
+ }
+ if x.begin >= x.end {
+ return Range{}
+ }
+ return x
+}
+
+// Offset returns x offset by off.
+func (x Range) Offset(off int) Range {
+ x.begin += off
+ x.end += off
+ return x
+}
+
+// Len returns the length of x.
+func (x Range) Len() int {
+ l := x.end - x.begin
+ if l < 0 {
+ l = 0
+ }
+ return l
+}
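
A brief sketch of the exported View API defined above, from a client's point of
view (illustration only; assumes the gvisor.dev/gvisor/pkg/buffer import path
used in this commit):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/buffer"
)

func main() {
	var v buffer.View // the zero value is an empty, usable view

	v.Append([]byte("payload"))
	v.Prepend([]byte("header "))
	fmt.Println(v.Size()) // 14

	// Random-access read that does not consume anything.
	tmp := make([]byte, 6)
	n, _ := v.ReadAt(tmp, 0)
	fmt.Printf("%q\n", tmp[:n]) // "header"

	// Consume the prepended header, leaving only the payload.
	v.TrimFront(7)
	fmt.Printf("%q\n", v.Flatten()) // "payload"
}
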
diff --git a/pkg/buffer/view_unsafe.go b/pkg/buffer/view_unsafe.go
new file mode 100644
index 000000000..d1ef39b26
--- /dev/null
+++ b/pkg/buffer/view_unsafe.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+ "unsafe"
+)
+
+// minBatch is the smallest Read or Write operation that the
+// WriteFromReader and ReadToWriter functions will use.
+//
+// This is defined as the size of a native pointer.
+const minBatch = int(unsafe.Sizeof(uintptr(0)))
diff --git a/pkg/tcpip/header/parse/parse.go b/pkg/tcpip/header/parse/parse.go
index ebb4b2c1d..1c913b5e1 100644
--- a/pkg/tcpip/header/parse/parse.go
+++ b/pkg/tcpip/header/parse/parse.go
@@ -60,9 +60,13 @@ func IPv4(pkt *stack.PacketBuffer) bool {
return false
}
ipHdr = header.IPv4(hdr)
+ length := int(ipHdr.TotalLength()) - len(hdr)
+ if length < 0 {
+ return false
+ }
pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
- pkt.Data().CapLength(int(ipHdr.TotalLength()) - len(hdr))
+ pkt.Data().CapLength(length)
return true
}
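
For context, the new check rejects an IPv4 header whose TotalLength field
claims fewer bytes than the header that was already pulled up; with such a
packet the old code would pass a negative value to CapLength. A toy
illustration with hypothetical numbers:

package main

import "fmt"

func main() {
	// Hypothetical malformed packet: TotalLength (18) < header length (20).
	totalLength, headerLen := 18, 20
	if length := totalLength - headerLen; length < 0 {
		fmt.Println("reject: advertised payload length", length, "is negative")
		return
	}
}
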
diff --git a/pkg/tcpip/network/internal/fragmentation/reassembler.go b/pkg/tcpip/network/internal/fragmentation/reassembler.go
index 90075a70c..56b76a284 100644
--- a/pkg/tcpip/network/internal/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/internal/fragmentation/reassembler.go
@@ -167,8 +167,7 @@ func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *s
resPkt := r.holes[0].pkt
for i := 1; i < len(r.holes); i++ {
- fragData := r.holes[i].pkt.Data()
- resPkt.Data().ReadFromData(fragData, fragData.Size())
+ stack.MergeFragment(resPkt, r.holes[i].pkt)
}
return resPkt, r.proto, true, memConsumed, nil
}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 880290b4b..febbb3f38 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -1472,13 +1472,19 @@ func (e *endpoint) processExtensionHeaders(h header.IPv6, pkt *stack.PacketBuffe
// If the last header in the payload isn't a known IPv6 extension header,
// handle it as if it is transport layer data.
+ // Calculate the number of octets parsed from data. We want to remove all
+ // the data except the unparsed portion located at the end, whose size is
+ // extHdr.Buf.Size().
+ trim := pkt.Data().Size() - extHdr.Buf.Size()
+
// For unfragmented packets, extHdr still contains the transport header.
// Get rid of it.
//
// For reassembled fragments, pkt.TransportHeader is unset, so this is a
// no-op and pkt.Data begins with the transport header.
- extHdr.Buf.TrimFront(pkt.TransportHeader().View().Size())
- pkt.Data().Replace(extHdr.Buf)
+ trim += pkt.TransportHeader().View().Size()
+
+ pkt.Data().DeleteFront(trim)
stats.PacketsDelivered.Increment()
if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index fc3c54e34..e2e073091 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -16,9 +16,10 @@ package stack
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/buffer"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
- "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ tcpipbuffer "gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
)
@@ -39,7 +40,7 @@ type PacketBufferOptions struct {
// Data is the initial unparsed data for the new packet. If set, it will be
// owned by the new packet.
- Data buffer.VectorisedView
+ Data tcpipbuffer.VectorisedView
// IsForwardedPacket identifies that the PacketBuffer being created is for a
// forwarded packet.
@@ -56,6 +57,34 @@ type PacketBufferOptions struct {
// empty. Use of PacketBuffer in any other order is unsupported.
//
// PacketBuffer must be created with NewPacketBuffer.
+//
+// Internal structure: A PacketBuffer holds a pointer to buffer.Buffer, which
+// exposes logically contiguous byte storage. The underlying storage structure
+// is abstracted out and should not usually be a concern here.
+//
+// |- reserved ->|
+// |--->| consumed (incoming)
+// 0 V V
+// +--------+----+----+--------------------+
+// | | | | current data ... | (buf)
+// +--------+----+----+--------------------+
+// ^ |
+// |<---| pushed (outgoing)
+//
+// When a PacketBuffer is created, a `reserved` header region can be specified,
+// into which the stack pushes headers for an outgoing packet. An incoming
+// packet may have no such region, in which case `reserved` is 0. The value of
+// `reserved` never changes over the entire lifetime of the packet.
+//
+// Outgoing Packet: When a header is pushed, `pushed` gets incremented by the
+// pushed length, and the current value is stored for each header. PacketBuffer
+// subtracts this value from `reserved` to compute the starting offset of each
+// header in `buf`.
+//
+// Incoming Packet: When a header is consumed (a.k.a. parsed), the current
+// `consumed` value is stored for each header, and it gets incremented by the
+// consumed length. PacketBuffer adds this value to `reserved` to compute the
+// starting offset of each header in `buf`.
type PacketBuffer struct {
_ sync.NoCopy
@@ -63,28 +92,16 @@ type PacketBuffer struct {
// PacketBuffers.
PacketBufferEntry
- // data holds the payload of the packet.
- //
- // For inbound packets, Data is initially the whole packet. Then gets moved to
- // headers via PacketHeader.Consume, when the packet is being parsed.
- //
- // For outbound packets, Data is the innermost layer, defined by the protocol.
- // Headers are pushed in front of it via PacketHeader.Push.
- //
- // The bytes backing Data are immutable, a.k.a. users shouldn't write to its
- // backing storage.
- data buffer.VectorisedView
+ // buf is the underlying buffer for the packet. See struct level docs for
+ // details.
+ buf *buffer.Buffer
+ reserved int
+ pushed int
+ consumed int
// headers stores metadata about each header.
headers [numHeaderType]headerInfo
- // header is the internal storage for outbound packets. Headers will be pushed
- // (prepended) on this storage as the packet is being constructed.
- //
- // TODO(gvisor.dev/issue/2404): Switch to an implementation that header and
- // data are held in the same underlying buffer storage.
- header buffer.Prependable
-
// NetworkProtocolNumber is only valid when NetworkHeader().View().IsEmpty()
// returns false.
// TODO(gvisor.dev/issue/3574): Remove the separately passed protocol
@@ -131,10 +148,14 @@ type PacketBuffer struct {
// NewPacketBuffer creates a new PacketBuffer with opts.
func NewPacketBuffer(opts PacketBufferOptions) *PacketBuffer {
pk := &PacketBuffer{
- data: opts.Data,
+ buf: &buffer.Buffer{},
}
if opts.ReserveHeaderBytes != 0 {
- pk.header = buffer.NewPrependable(opts.ReserveHeaderBytes)
+ pk.buf.AppendOwned(make([]byte, opts.ReserveHeaderBytes))
+ pk.reserved = opts.ReserveHeaderBytes
+ }
+ for _, v := range opts.Data.Views() {
+ pk.buf.AppendOwned(v)
}
if opts.IsForwardedPacket {
pk.NetworkPacketInfo.IsForwardedPacket = opts.IsForwardedPacket
@@ -145,13 +166,13 @@ func NewPacketBuffer(opts PacketBufferOptions) *PacketBuffer {
// ReservedHeaderBytes returns the number of bytes initially reserved for
// headers.
func (pk *PacketBuffer) ReservedHeaderBytes() int {
- return pk.header.UsedLength() + pk.header.AvailableLength()
+ return pk.reserved
}
// AvailableHeaderBytes returns the number of bytes currently available for
// headers. This is relevant to PacketHeader.Push method only.
func (pk *PacketBuffer) AvailableHeaderBytes() int {
- return pk.header.AvailableLength()
+ return pk.reserved - pk.pushed
}
// LinkHeader returns the handle to link-layer header.
@@ -180,24 +201,18 @@ func (pk *PacketBuffer) TransportHeader() PacketHeader {
// HeaderSize returns the total size of all headers in bytes.
func (pk *PacketBuffer) HeaderSize() int {
- // Note for inbound packets (Consume called), headers are not stored in
- // pk.header. Thus, calculation of size of each header is needed.
- var size int
- for i := range pk.headers {
- size += len(pk.headers[i].buf)
- }
- return size
+ return pk.pushed + pk.consumed
}
// Size returns the size of packet in bytes.
func (pk *PacketBuffer) Size() int {
- return pk.HeaderSize() + pk.data.Size()
+ return int(pk.buf.Size()) - pk.headerOffset()
}
// MemSize returns the estimation size of the pk in memory, including backing
// buffer data.
func (pk *PacketBuffer) MemSize() int {
- return pk.HeaderSize() + pk.data.MemSize() + packetBufferStructSize
+ return int(pk.buf.Size()) + packetBufferStructSize
}
// Data returns the handle to data portion of pk.
@@ -206,61 +221,65 @@ func (pk *PacketBuffer) Data() PacketData {
}
// Views returns the underlying storage of the whole packet.
-func (pk *PacketBuffer) Views() []buffer.View {
- // Optimization for outbound packets that headers are in pk.header.
- useHeader := true
- for i := range pk.headers {
- if !canUseHeader(&pk.headers[i]) {
- useHeader = false
- break
- }
- }
+func (pk *PacketBuffer) Views() []tcpipbuffer.View {
+ var views []tcpipbuffer.View
+ offset := pk.headerOffset()
+ pk.buf.SubApply(offset, int(pk.buf.Size())-offset, func(v []byte) {
+ views = append(views, v)
+ })
+ return views
+}
- dataViews := pk.data.Views()
-
- var vs []buffer.View
- if useHeader {
- vs = make([]buffer.View, 0, 1+len(dataViews))
- vs = append(vs, pk.header.View())
- } else {
- vs = make([]buffer.View, 0, len(pk.headers)+len(dataViews))
- for i := range pk.headers {
- if v := pk.headers[i].buf; len(v) > 0 {
- vs = append(vs, v)
- }
- }
- }
- return append(vs, dataViews...)
+func (pk *PacketBuffer) headerOffset() int {
+ return pk.reserved - pk.pushed
+}
+
+func (pk *PacketBuffer) headerOffsetOf(typ headerType) int {
+ return pk.reserved + pk.headers[typ].offset
}
-func canUseHeader(h *headerInfo) bool {
- // h.offset will be negative if the header was pushed in to prependable
- // portion, or doesn't matter when it's empty.
- return len(h.buf) == 0 || h.offset < 0
+func (pk *PacketBuffer) dataOffset() int {
+ return pk.reserved + pk.consumed
}
-func (pk *PacketBuffer) push(typ headerType, size int) buffer.View {
+func (pk *PacketBuffer) push(typ headerType, size int) tcpipbuffer.View {
h := &pk.headers[typ]
- if h.buf != nil {
+ if h.length > 0 {
panic(fmt.Sprintf("push must not be called twice: type %s", typ))
}
- h.buf = buffer.View(pk.header.Prepend(size))
- h.offset = -pk.header.UsedLength()
- return h.buf
+ if pk.pushed+size > pk.reserved {
+ panic("not enough headroom reserved")
+ }
+ pk.pushed += size
+ h.offset = -pk.pushed
+ h.length = size
+ return pk.headerView(typ)
}
-func (pk *PacketBuffer) consume(typ headerType, size int) (v buffer.View, consumed bool) {
+func (pk *PacketBuffer) consume(typ headerType, size int) (v tcpipbuffer.View, consumed bool) {
h := &pk.headers[typ]
- if h.buf != nil {
+ if h.length > 0 {
panic(fmt.Sprintf("consume must not be called twice: type %s", typ))
}
- v, ok := pk.data.PullUp(size)
+ if pk.headerOffset()+pk.consumed+size > int(pk.buf.Size()) {
+ return nil, false
+ }
+ h.offset = pk.consumed
+ h.length = size
+ pk.consumed += size
+ return pk.headerView(typ), true
+}
+
+func (pk *PacketBuffer) headerView(typ headerType) tcpipbuffer.View {
+ h := &pk.headers[typ]
+ if h.length == 0 {
+ return nil
+ }
+ v, ok := pk.buf.PullUp(pk.headerOffsetOf(typ), h.length)
if !ok {
- return
+ panic("PullUp failed")
}
- pk.data.TrimFront(size)
- h.buf = v
- return h.buf, true
+ return v
}
// Clone makes a shallow copy of pk.
@@ -270,9 +289,11 @@ func (pk *PacketBuffer) consume(typ headerType, size int) (v buffer.View, consum
func (pk *PacketBuffer) Clone() *PacketBuffer {
return &PacketBuffer{
PacketBufferEntry: pk.PacketBufferEntry,
- data: pk.data.Clone(nil),
+ buf: pk.buf,
+ reserved: pk.reserved,
+ pushed: pk.pushed,
+ consumed: pk.consumed,
headers: pk.headers,
- header: pk.header,
Hash: pk.Hash,
Owner: pk.Owner,
GSOOptions: pk.GSOOptions,
@@ -306,9 +327,11 @@ func (pk *PacketBuffer) Network() header.Network {
// See PacketBuffer.Data for details about how a packet buffer holds an inbound
// packet.
func (pk *PacketBuffer) CloneToInbound() *PacketBuffer {
- newPk := NewPacketBuffer(PacketBufferOptions{
- Data: buffer.NewVectorisedView(pk.Size(), pk.Views()),
- })
+ newPk := &PacketBuffer{
+ buf: pk.buf,
+ // Treat unfilled header portion as reserved.
+ reserved: pk.AvailableHeaderBytes(),
+ }
// TODO(gvisor.dev/issue/5696): reimplement conntrack so that no need to
// maintain this flag in the packet. Currently conntrack needs this flag to
// tell if a noop connection should be inserted at Input hook. Once conntrack
@@ -322,15 +345,12 @@ func (pk *PacketBuffer) CloneToInbound() *PacketBuffer {
// headerInfo stores metadata about a header in a packet.
type headerInfo struct {
- // buf is the memorized slice for both prepended and consumed header.
- // When header is prepended, buf serves as memorized value, which is a slice
- // of pk.header. When header is consumed, buf is the slice pulled out from
- // pk.Data, which is the only place to hold this header.
- buf buffer.View
-
- // offset will be a negative number denoting the offset where this header is
- // from the end of pk.header, if it is prepended. Otherwise, zero.
+ // offset is the offset of the header in pk.buf relative to
+ // pk.buf[pk.reserved]. See the PacketBuffer struct for details.
offset int
+
+ // length is the length of this header.
+ length int
}
// PacketHeader is a handle object to a header in the underlying packet.
@@ -340,14 +360,14 @@ type PacketHeader struct {
}
// View returns the underlying storage of h.
-func (h PacketHeader) View() buffer.View {
- return h.pk.headers[h.typ].buf
+func (h PacketHeader) View() tcpipbuffer.View {
+ return h.pk.headerView(h.typ)
}
// Push pushes size bytes in the front of its residing packet, and returns the
// backing storage. Callers may only call one of Push or Consume once on each
// header in the lifetime of the underlying packet.
-func (h PacketHeader) Push(size int) buffer.View {
+func (h PacketHeader) Push(size int) tcpipbuffer.View {
return h.pk.push(h.typ, size)
}
@@ -356,7 +376,7 @@ func (h PacketHeader) Push(size int) buffer.View {
// size, consumed will be false, and the state of h will not be affected.
// Callers may only call one of Push or Consume once on each header in the
// lifetime of the underlying packet.
-func (h PacketHeader) Consume(size int) (v buffer.View, consumed bool) {
+func (h PacketHeader) Consume(size int) (v tcpipbuffer.View, consumed bool) {
return h.pk.consume(h.typ, size)
}
@@ -367,55 +387,84 @@ type PacketData struct {
// PullUp returns a contiguous view of size bytes from the beginning of d.
// Callers should not write to or keep the view for later use.
-func (d PacketData) PullUp(size int) (buffer.View, bool) {
- return d.pk.data.PullUp(size)
+func (d PacketData) PullUp(size int) (tcpipbuffer.View, bool) {
+ return d.pk.buf.PullUp(d.pk.dataOffset(), size)
}
// DeleteFront removes count from the beginning of d. It panics if count >
// d.Size(). All backing storage references after the front of d are
// invalidated.
func (d PacketData) DeleteFront(count int) {
- d.pk.data.TrimFront(count)
+ if !d.pk.buf.Remove(d.pk.dataOffset(), count) {
+ panic("count > d.Size()")
+ }
}
// CapLength reduces d to at most length bytes.
func (d PacketData) CapLength(length int) {
- d.pk.data.CapLength(length)
+ if length < 0 {
+ panic("length < 0")
+ }
+ if currLength := d.Size(); currLength > length {
+ trim := currLength - length
+ d.pk.buf.Remove(int(d.pk.buf.Size())-trim, trim)
+ }
}
// Views returns the underlying storage of d in a slice of Views. Caller should
// not modify the returned slice.
-func (d PacketData) Views() []buffer.View {
- return d.pk.data.Views()
+func (d PacketData) Views() []tcpipbuffer.View {
+ var views []tcpipbuffer.View
+ offset := d.pk.dataOffset()
+ d.pk.buf.SubApply(offset, int(d.pk.buf.Size())-offset, func(v []byte) {
+ views = append(views, v)
+ })
+ return views
}
// AppendView appends v into d, taking the ownership of v.
-func (d PacketData) AppendView(v buffer.View) {
- d.pk.data.AppendView(v)
+func (d PacketData) AppendView(v tcpipbuffer.View) {
+ d.pk.buf.AppendOwned(v)
}
-// ReadFromData moves at most count bytes from the beginning of srcData to the
-// end of d and returns the number of bytes moved.
-func (d PacketData) ReadFromData(srcData PacketData, count int) int {
- return srcData.pk.data.ReadToVV(&d.pk.data, count)
+// MergeFragment appends the data portion of frag to dst. It takes ownership of
+// frag and frag should not be used again.
+func MergeFragment(dst, frag *PacketBuffer) {
+ frag.buf.TrimFront(int64(frag.dataOffset()))
+ dst.buf.Merge(frag.buf)
}
// ReadFromVV moves at most count bytes from the beginning of srcVV to the end
// of d and returns the number of bytes moved.
-func (d PacketData) ReadFromVV(srcVV *buffer.VectorisedView, count int) int {
- return srcVV.ReadToVV(&d.pk.data, count)
+func (d PacketData) ReadFromVV(srcVV *tcpipbuffer.VectorisedView, count int) int {
+ done := 0
+ for _, v := range srcVV.Views() {
+ if len(v) < count {
+ count -= len(v)
+ done += len(v)
+ d.pk.buf.AppendOwned(v)
+ } else {
+ v = v[:count]
+ count -= len(v)
+ done += len(v)
+ d.pk.buf.Append(v)
+ break
+ }
+ }
+ srcVV.TrimFront(done)
+ return done
}
// Size returns the number of bytes in the data payload of the packet.
func (d PacketData) Size() int {
- return d.pk.data.Size()
+ return int(d.pk.buf.Size()) - d.pk.dataOffset()
}
// AsRange returns a Range representing the current data payload of the packet.
func (d PacketData) AsRange() Range {
return Range{
pk: d.pk,
- offset: d.pk.HeaderSize(),
+ offset: d.pk.dataOffset(),
length: d.Size(),
}
}
@@ -425,17 +474,12 @@ func (d PacketData) AsRange() Range {
//
// This method exists for compatibility between PacketBuffer and VectorisedView.
// It may be removed later and should be used with care.
-func (d PacketData) ExtractVV() buffer.VectorisedView {
- return d.pk.data
-}
-
-// Replace replaces the data portion of the packet with vv, taking the ownership
-// of vv.
-//
-// This method exists for compatibility between PacketBuffer and VectorisedView.
-// It may be removed later and should be used with care.
-func (d PacketData) Replace(vv buffer.VectorisedView) {
- d.pk.data = vv
+func (d PacketData) ExtractVV() tcpipbuffer.VectorisedView {
+ var vv tcpipbuffer.VectorisedView
+ d.pk.buf.SubApply(d.pk.dataOffset(), d.pk.Size(), func(v []byte) {
+ vv.AppendView(v)
+ })
+ return vv
}
// Range represents a contiguous subportion of a PacketBuffer.
@@ -479,9 +523,9 @@ func (r Range) Capped(max int) Range {
// AsView returns the backing storage of r if possible. It will allocate a new
// View if r spans multiple pieces internally. Caller should not write to the
// returned View in any way.
-func (r Range) AsView() buffer.View {
+func (r Range) AsView() tcpipbuffer.View {
var allocated bool
- var v buffer.View
+ var v tcpipbuffer.View
r.iterate(func(b []byte) {
if v == nil {
// v has not been assigned, allowing first view to be returned.
@@ -502,7 +546,7 @@ func (r Range) AsView() buffer.View {
}
// ToOwnedView returns an owned copy of data in r.
-func (r Range) ToOwnedView() buffer.View {
+func (r Range) ToOwnedView() tcpipbuffer.View {
if r.length == 0 {
return nil
}
@@ -523,63 +567,7 @@ func (r Range) Checksum() uint16 {
// iterate calls fn for each piece in r. fn is always called with a non-empty
// slice.
func (r Range) iterate(fn func([]byte)) {
- w := window{
- offset: r.offset,
- length: r.length,
- }
- // Header portion.
- for i := range r.pk.headers {
- if b := w.process(r.pk.headers[i].buf); len(b) > 0 {
- fn(b)
- }
- if w.isDone() {
- break
- }
- }
- // Data portion.
- if !w.isDone() {
- for _, v := range r.pk.data.Views() {
- if b := w.process(v); len(b) > 0 {
- fn(b)
- }
- if w.isDone() {
- break
- }
- }
- }
-}
-
-// window represents contiguous region of byte stream. User would call process()
-// to input bytes, and obtain a subslice that is inside the window.
-type window struct {
- offset int
- length int
-}
-
-// isDone returns true if the window has passed and further process() calls will
-// always return an empty slice. This can be used to end processing early.
-func (w *window) isDone() bool {
- return w.length == 0
-}
-
-// process feeds b in and returns a subslice that is inside the window. The
-// returned slice will be a subslice of b, and it does not keep b after method
-// returns. This method may return an empty slice if nothing in b is inside the
-// window.
-func (w *window) process(b []byte) (inWindow []byte) {
- if w.offset >= len(b) {
- w.offset -= len(b)
- return nil
- }
- if w.offset > 0 {
- b = b[w.offset:]
- w.offset = 0
- }
- if w.length < len(b) {
- b = b[:w.length]
- }
- w.length -= len(b)
- return b
+ r.pk.buf.SubApply(r.offset, r.length, fn)
}
// PayloadSince returns packet payload starting from and including a particular
@@ -587,21 +575,14 @@ func (w *window) process(b []byte) (inWindow []byte) {
//
// The returned View is owned by the caller - its backing buffer is separate
// from the packet header's underlying packet buffer.
-func PayloadSince(h PacketHeader) buffer.View {
- size := h.pk.data.Size()
- for _, hinfo := range h.pk.headers[h.typ:] {
- size += len(hinfo.buf)
+func PayloadSince(h PacketHeader) tcpipbuffer.View {
+ offset := h.pk.headerOffset()
+ for i := headerType(0); i < h.typ; i++ {
+ offset += h.pk.headers[i].length
}
-
- v := make(buffer.View, 0, size)
-
- for _, hinfo := range h.pk.headers[h.typ:] {
- v = append(v, hinfo.buf...)
- }
-
- for _, view := range h.pk.data.Views() {
- v = append(v, view...)
- }
-
- return v
+ return Range{
+ pk: h.pk,
+ offset: offset,
+ length: int(h.pk.buf.Size()) - offset,
+ }.ToOwnedView()
}
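
To make the reserved/pushed/consumed arithmetic in the new PacketBuffer
concrete, here is a toy walk-through with hypothetical sizes (a sketch of the
bookkeeping described in the struct comment, not gVisor code):

package main

import "fmt"

func main() {
	// Outgoing packet: 40 bytes of headroom reserved via ReserveHeaderBytes.
	reserved, pushed := 40, 0
	push := func(size int) int {
		pushed += size
		return reserved + (-pushed) // headerOffsetOf: reserved + stored offset
	}
	fmt.Println(push(20)) // 20: transport header occupies buf[20:40]
	fmt.Println(push(20)) // 0:  network header occupies buf[0:20]

	// Incoming packet: typically no headroom, so reserved is 0.
	reservedIn, consumed := 0, 0
	consume := func(size int) int {
		offset := consumed
		consumed += size
		return reservedIn + offset
	}
	fmt.Println(consume(20)) // 0:  network header occupies buf[0:20]
	fmt.Println(consume(20)) // 20: transport header occupies buf[20:40]
}
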