diff options
Diffstat (limited to 'pkg')
23 files changed, 408 insertions, 1488 deletions
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go deleted file mode 100644 index 311808ae9..000000000 --- a/pkg/buffer/buffer.go +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package buffer provides the implementation of a buffer view. -// -// A view is an flexible buffer, supporting the safecopy operations natively as -// well as the ability to grow via either prepend or append, as well as shrink. -package buffer - -// buffer encapsulates a queueable byte buffer. -// -// +stateify savable -type buffer struct { - data []byte - read int - write int - bufferEntry -} - -// init performs in-place initialization for zero value. -func (b *buffer) init(size int) { - b.data = make([]byte, size) -} - -// Reset resets read and write locations, effectively emptying the buffer. -func (b *buffer) Reset() { - b.read = 0 - b.write = 0 -} - -// Full indicates the buffer is full. -// -// This indicates there is no capacity left to write. -func (b *buffer) Full() bool { - return b.write == len(b.data) -} - -// ReadSize returns the number of bytes available for reading. -func (b *buffer) ReadSize() int { - return b.write - b.read -} - -// ReadMove advances the read index by the given amount. -func (b *buffer) ReadMove(n int) { - b.read += n -} - -// ReadSlice returns the read slice for this buffer. -func (b *buffer) ReadSlice() []byte { - return b.data[b.read:b.write] -} - -// WriteSize returns the number of bytes available for writing. -func (b *buffer) WriteSize() int { - return len(b.data) - b.write -} - -// WriteMove advances the write index by the given amount. -func (b *buffer) WriteMove(n int) { - b.write += n -} - -// WriteSlice returns the write slice for this buffer. -func (b *buffer) WriteSlice() []byte { - return b.data[b.write:] -} diff --git a/pkg/buffer/buffer_list.go b/pkg/buffer/buffer_list.go deleted file mode 100644 index 6b5bea3fc..000000000 --- a/pkg/buffer/buffer_list.go +++ /dev/null @@ -1,221 +0,0 @@ -package buffer - -// ElementMapper provides an identity mapping by default. -// -// This can be replaced to provide a struct that maps elements to linker -// objects, if they are not the same. An ElementMapper is not typically -// required if: Linker is left as is, Element is left as is, or Linker and -// Element are the same type. -type bufferElementMapper struct{} - -// linkerFor maps an Element to a Linker. -// -// This default implementation should be inlined. -// -//go:nosplit -func (bufferElementMapper) linkerFor(elem *buffer) *buffer { return elem } - -// List is an intrusive list. Entries can be added to or removed from the list -// in O(1) time and with no additional memory allocations. -// -// The zero value for List is an empty list ready to use. -// -// To iterate over a list (where l is a List): -// for e := l.Front(); e != nil; e = e.Next() { -// // do something with e. -// } -// -// +stateify savable -type bufferList struct { - head *buffer - tail *buffer -} - -// Reset resets list l to the empty state. -func (l *bufferList) Reset() { - l.head = nil - l.tail = nil -} - -// Empty returns true iff the list is empty. -// -//go:nosplit -func (l *bufferList) Empty() bool { - return l.head == nil -} - -// Front returns the first element of list l or nil. -// -//go:nosplit -func (l *bufferList) Front() *buffer { - return l.head -} - -// Back returns the last element of list l or nil. -// -//go:nosplit -func (l *bufferList) Back() *buffer { - return l.tail -} - -// Len returns the number of elements in the list. -// -// NOTE: This is an O(n) operation. -// -//go:nosplit -func (l *bufferList) Len() (count int) { - for e := l.Front(); e != nil; e = (bufferElementMapper{}.linkerFor(e)).Next() { - count++ - } - return count -} - -// PushFront inserts the element e at the front of list l. -// -//go:nosplit -func (l *bufferList) PushFront(e *buffer) { - linker := bufferElementMapper{}.linkerFor(e) - linker.SetNext(l.head) - linker.SetPrev(nil) - if l.head != nil { - bufferElementMapper{}.linkerFor(l.head).SetPrev(e) - } else { - l.tail = e - } - - l.head = e -} - -// PushBack inserts the element e at the back of list l. -// -//go:nosplit -func (l *bufferList) PushBack(e *buffer) { - linker := bufferElementMapper{}.linkerFor(e) - linker.SetNext(nil) - linker.SetPrev(l.tail) - if l.tail != nil { - bufferElementMapper{}.linkerFor(l.tail).SetNext(e) - } else { - l.head = e - } - - l.tail = e -} - -// PushBackList inserts list m at the end of list l, emptying m. -// -//go:nosplit -func (l *bufferList) PushBackList(m *bufferList) { - if l.head == nil { - l.head = m.head - l.tail = m.tail - } else if m.head != nil { - bufferElementMapper{}.linkerFor(l.tail).SetNext(m.head) - bufferElementMapper{}.linkerFor(m.head).SetPrev(l.tail) - - l.tail = m.tail - } - m.head = nil - m.tail = nil -} - -// InsertAfter inserts e after b. -// -//go:nosplit -func (l *bufferList) InsertAfter(b, e *buffer) { - bLinker := bufferElementMapper{}.linkerFor(b) - eLinker := bufferElementMapper{}.linkerFor(e) - - a := bLinker.Next() - - eLinker.SetNext(a) - eLinker.SetPrev(b) - bLinker.SetNext(e) - - if a != nil { - bufferElementMapper{}.linkerFor(a).SetPrev(e) - } else { - l.tail = e - } -} - -// InsertBefore inserts e before a. -// -//go:nosplit -func (l *bufferList) InsertBefore(a, e *buffer) { - aLinker := bufferElementMapper{}.linkerFor(a) - eLinker := bufferElementMapper{}.linkerFor(e) - - b := aLinker.Prev() - eLinker.SetNext(a) - eLinker.SetPrev(b) - aLinker.SetPrev(e) - - if b != nil { - bufferElementMapper{}.linkerFor(b).SetNext(e) - } else { - l.head = e - } -} - -// Remove removes e from l. -// -//go:nosplit -func (l *bufferList) Remove(e *buffer) { - linker := bufferElementMapper{}.linkerFor(e) - prev := linker.Prev() - next := linker.Next() - - if prev != nil { - bufferElementMapper{}.linkerFor(prev).SetNext(next) - } else if l.head == e { - l.head = next - } - - if next != nil { - bufferElementMapper{}.linkerFor(next).SetPrev(prev) - } else if l.tail == e { - l.tail = prev - } - - linker.SetNext(nil) - linker.SetPrev(nil) -} - -// Entry is a default implementation of Linker. Users can add anonymous fields -// of this type to their structs to make them automatically implement the -// methods needed by List. -// -// +stateify savable -type bufferEntry struct { - next *buffer - prev *buffer -} - -// Next returns the entry that follows e in the list. -// -//go:nosplit -func (e *bufferEntry) Next() *buffer { - return e.next -} - -// Prev returns the entry that precedes e in the list. -// -//go:nosplit -func (e *bufferEntry) Prev() *buffer { - return e.prev -} - -// SetNext assigns 'entry' as the entry that follows e in the list. -// -//go:nosplit -func (e *bufferEntry) SetNext(elem *buffer) { - e.next = elem -} - -// SetPrev assigns 'entry' as the entry that precedes e in the list. -// -//go:nosplit -func (e *bufferEntry) SetPrev(elem *buffer) { - e.prev = elem -} diff --git a/pkg/buffer/buffer_state_autogen.go b/pkg/buffer/buffer_state_autogen.go deleted file mode 100644 index 29007f642..000000000 --- a/pkg/buffer/buffer_state_autogen.go +++ /dev/null @@ -1,153 +0,0 @@ -// automatically generated by stateify. - -package buffer - -import ( - "gvisor.dev/gvisor/pkg/state" -) - -func (b *buffer) StateTypeName() string { - return "pkg/buffer.buffer" -} - -func (b *buffer) StateFields() []string { - return []string{ - "data", - "read", - "write", - "bufferEntry", - } -} - -func (b *buffer) beforeSave() {} - -func (b *buffer) StateSave(stateSinkObject state.Sink) { - b.beforeSave() - stateSinkObject.Save(0, &b.data) - stateSinkObject.Save(1, &b.read) - stateSinkObject.Save(2, &b.write) - stateSinkObject.Save(3, &b.bufferEntry) -} - -func (b *buffer) afterLoad() {} - -func (b *buffer) StateLoad(stateSourceObject state.Source) { - stateSourceObject.Load(0, &b.data) - stateSourceObject.Load(1, &b.read) - stateSourceObject.Load(2, &b.write) - stateSourceObject.Load(3, &b.bufferEntry) -} - -func (l *bufferList) StateTypeName() string { - return "pkg/buffer.bufferList" -} - -func (l *bufferList) StateFields() []string { - return []string{ - "head", - "tail", - } -} - -func (l *bufferList) beforeSave() {} - -func (l *bufferList) StateSave(stateSinkObject state.Sink) { - l.beforeSave() - stateSinkObject.Save(0, &l.head) - stateSinkObject.Save(1, &l.tail) -} - -func (l *bufferList) afterLoad() {} - -func (l *bufferList) StateLoad(stateSourceObject state.Source) { - stateSourceObject.Load(0, &l.head) - stateSourceObject.Load(1, &l.tail) -} - -func (e *bufferEntry) StateTypeName() string { - return "pkg/buffer.bufferEntry" -} - -func (e *bufferEntry) StateFields() []string { - return []string{ - "next", - "prev", - } -} - -func (e *bufferEntry) beforeSave() {} - -func (e *bufferEntry) StateSave(stateSinkObject state.Sink) { - e.beforeSave() - stateSinkObject.Save(0, &e.next) - stateSinkObject.Save(1, &e.prev) -} - -func (e *bufferEntry) afterLoad() {} - -func (e *bufferEntry) StateLoad(stateSourceObject state.Source) { - stateSourceObject.Load(0, &e.next) - stateSourceObject.Load(1, &e.prev) -} - -func (p *pool) StateTypeName() string { - return "pkg/buffer.pool" -} - -func (p *pool) StateFields() []string { - return []string{ - "bufferSize", - "embeddedStorage", - } -} - -func (p *pool) beforeSave() {} - -func (p *pool) StateSave(stateSinkObject state.Sink) { - p.beforeSave() - stateSinkObject.Save(0, &p.bufferSize) - stateSinkObject.Save(1, &p.embeddedStorage) -} - -func (p *pool) StateLoad(stateSourceObject state.Source) { - stateSourceObject.Load(0, &p.bufferSize) - stateSourceObject.LoadWait(1, &p.embeddedStorage) - stateSourceObject.AfterLoad(p.afterLoad) -} - -func (v *View) StateTypeName() string { - return "pkg/buffer.View" -} - -func (v *View) StateFields() []string { - return []string{ - "data", - "size", - "pool", - } -} - -func (v *View) beforeSave() {} - -func (v *View) StateSave(stateSinkObject state.Sink) { - v.beforeSave() - stateSinkObject.Save(0, &v.data) - stateSinkObject.Save(1, &v.size) - stateSinkObject.Save(2, &v.pool) -} - -func (v *View) afterLoad() {} - -func (v *View) StateLoad(stateSourceObject state.Source) { - stateSourceObject.Load(0, &v.data) - stateSourceObject.Load(1, &v.size) - stateSourceObject.Load(2, &v.pool) -} - -func init() { - state.Register((*buffer)(nil)) - state.Register((*bufferList)(nil)) - state.Register((*bufferEntry)(nil)) - state.Register((*pool)(nil)) - state.Register((*View)(nil)) -} diff --git a/pkg/buffer/buffer_unsafe_state_autogen.go b/pkg/buffer/buffer_unsafe_state_autogen.go deleted file mode 100644 index 5a5c40722..000000000 --- a/pkg/buffer/buffer_unsafe_state_autogen.go +++ /dev/null @@ -1,3 +0,0 @@ -// automatically generated by stateify. - -package buffer diff --git a/pkg/buffer/pool.go b/pkg/buffer/pool.go deleted file mode 100644 index 7ad6132ab..000000000 --- a/pkg/buffer/pool.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package buffer - -const ( - // embeddedCount is the number of buffer structures embedded in the pool. It - // is also the number for overflow allocations. - embeddedCount = 8 - - // defaultBufferSize is the default size for each underlying storage buffer. - // - // It is slightly less than two pages. This is done intentionally to ensure - // that the buffer object aligns with runtime internals. This two page size - // will effectively minimize internal fragmentation, but still have a large - // enough chunk to limit excessive segmentation. - defaultBufferSize = 8144 -) - -// pool allocates buffer. -// -// It contains an embedded buffer storage for fast path when the number of -// buffers needed is small. -// -// +stateify savable -type pool struct { - bufferSize int - avail []buffer `state:"nosave"` - embeddedStorage [embeddedCount]buffer `state:"wait"` -} - -// get gets a new buffer from p. -func (p *pool) get() *buffer { - if p.avail == nil { - p.avail = p.embeddedStorage[:] - } - if len(p.avail) == 0 { - p.avail = make([]buffer, embeddedCount) - } - if p.bufferSize <= 0 { - p.bufferSize = defaultBufferSize - } - buf := &p.avail[0] - buf.init(p.bufferSize) - p.avail = p.avail[1:] - return buf -} - -// put releases buf. -func (p *pool) put(buf *buffer) { - // Remove reference to the underlying storage, allowing it to be garbage - // collected. - buf.data = nil -} - -// setBufferSize sets the size of underlying storage buffer for future -// allocations. It can be called at any time. -func (p *pool) setBufferSize(size int) { - p.bufferSize = size -} - -// afterLoad is invoked by stateify. -func (p *pool) afterLoad() { - // S/R does not save subslice into embeddedStorage correctly. Restore - // available portion of embeddedStorage manually. Restore as nil if none used. - for i := len(p.embeddedStorage); i > 0; i-- { - if p.embeddedStorage[i-1].data != nil { - p.avail = p.embeddedStorage[i:] - break - } - } -} diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go deleted file mode 100644 index 8b42575b4..000000000 --- a/pkg/buffer/safemem.go +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package buffer - -import ( - "gvisor.dev/gvisor/pkg/safemem" -) - -// WriteBlock returns this buffer as a write Block. -func (b *buffer) WriteBlock() safemem.Block { - return safemem.BlockFromSafeSlice(b.WriteSlice()) -} - -// ReadBlock returns this buffer as a read Block. -func (b *buffer) ReadBlock() safemem.Block { - return safemem.BlockFromSafeSlice(b.ReadSlice()) -} - -// WriteFromSafememReader writes up to count bytes from r to v and advances the -// write index by the number of bytes written. It calls r.ReadToBlocks() at -// most once. -func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) { - if count == 0 { - return 0, nil - } - - var ( - dst safemem.BlockSeq - blocks []safemem.Block - ) - - // Need at least one buffer. - firstBuf := v.data.Back() - if firstBuf == nil { - firstBuf = v.pool.get() - v.data.PushBack(firstBuf) - } - - // Does the last block have sufficient capacity alone? - if l := uint64(firstBuf.WriteSize()); l >= count { - dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count)) - } else { - // Append blocks until sufficient. - count -= l - blocks = append(blocks, firstBuf.WriteBlock()) - for count > 0 { - emptyBuf := v.pool.get() - v.data.PushBack(emptyBuf) - block := emptyBuf.WriteBlock().TakeFirst64(count) - count -= uint64(block.Len()) - blocks = append(blocks, block) - } - dst = safemem.BlockSeqFromSlice(blocks) - } - - // Perform I/O. - n, err := r.ReadToBlocks(dst) - v.size += int64(n) - - // Update all indices. - for left := n; left > 0; firstBuf = firstBuf.Next() { - if l := firstBuf.WriteSize(); left >= uint64(l) { - firstBuf.WriteMove(l) // Whole block. - left -= uint64(l) - } else { - firstBuf.WriteMove(int(left)) // Partial block. - left = 0 - } - } - - return n, err -} - -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the -// write index by the number of bytes written. -func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes()) -} - -// ReadToSafememWriter reads up to count bytes from v to w. It does not advance -// the read index. It calls w.WriteFromBlocks() at most once. -func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) { - if count == 0 { - return 0, nil - } - - var ( - src safemem.BlockSeq - blocks []safemem.Block - ) - - firstBuf := v.data.Front() - if firstBuf == nil { - return 0, nil // No EOF. - } - - // Is all the data in a single block? - if l := uint64(firstBuf.ReadSize()); l >= count { - src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count)) - } else { - // Build a list of all the buffers. - count -= l - blocks = append(blocks, firstBuf.ReadBlock()) - for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() { - block := buf.ReadBlock().TakeFirst64(count) - count -= uint64(block.Len()) - blocks = append(blocks, block) - } - src = safemem.BlockSeqFromSlice(blocks) - } - - // Perform I/O. As documented, we don't advance the read index. - return w.WriteFromBlocks(src) -} - -// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the -// read index by the number of bytes read, such that it's only safe to call if -// the caller guarantees that ReadToBlocks will only be called once. -func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes()) -} diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go deleted file mode 100644 index 00652d675..000000000 --- a/pkg/buffer/view.go +++ /dev/null @@ -1,391 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package buffer - -import ( - "fmt" - "io" -) - -// View is a non-linear buffer. -// -// All methods are thread compatible. -// -// +stateify savable -type View struct { - data bufferList - size int64 - pool pool -} - -// TrimFront removes the first count bytes from the buffer. -func (v *View) TrimFront(count int64) { - if count >= v.size { - v.advanceRead(v.size) - } else { - v.advanceRead(count) - } -} - -// ReadAt implements io.ReaderAt.ReadAt. -func (v *View) ReadAt(p []byte, offset int64) (int, error) { - var ( - skipped int64 - done int64 - ) - for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() { - needToSkip := int(offset - skipped) - if sz := buf.ReadSize(); sz <= needToSkip { - skipped += int64(sz) - continue - } - - // Actually read data. - n := copy(p[done:], buf.ReadSlice()[needToSkip:]) - skipped += int64(needToSkip) - done += int64(n) - } - if int(done) < len(p) || offset+done == v.size { - return int(done), io.EOF - } - return int(done), nil -} - -// advanceRead advances the view's read index. -// -// Precondition: there must be sufficient bytes in the buffer. -func (v *View) advanceRead(count int64) { - for buf := v.data.Front(); buf != nil && count > 0; { - sz := int64(buf.ReadSize()) - if sz > count { - // There is still data for reading. - buf.ReadMove(int(count)) - v.size -= count - count = 0 - break - } - - // Consume the whole buffer. - oldBuf := buf - buf = buf.Next() // Iterate. - v.data.Remove(oldBuf) - oldBuf.Reset() - v.pool.put(oldBuf) - - // Update counts. - count -= sz - v.size -= sz - } - if count > 0 { - panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count)) - } -} - -// Truncate truncates the view to the given bytes. -// -// This will not grow the view, only shrink it. If a length is passed that is -// greater than the current size of the view, then nothing will happen. -// -// Precondition: length must be >= 0. -func (v *View) Truncate(length int64) { - if length < 0 { - panic("negative length provided") - } - if length >= v.size { - return // Nothing to do. - } - for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() { - sz := int64(buf.ReadSize()) - if after := v.size - sz; after < length { - // Truncate the buffer locally. - left := (length - after) - buf.write = buf.read + int(left) - v.size = length - break - } - - // Drop the buffer completely; see above. - v.data.Remove(buf) - buf.Reset() - v.pool.put(buf) - v.size -= sz - } -} - -// Grow grows the given view to the number of bytes, which will be appended. If -// zero is true, all these bytes will be zero. If zero is false, then this is -// the caller's responsibility. -// -// Precondition: length must be >= 0. -func (v *View) Grow(length int64, zero bool) { - if length < 0 { - panic("negative length provided") - } - for v.size < length { - buf := v.data.Back() - - // Is there some space in the last buffer? - if buf == nil || buf.Full() { - buf = v.pool.get() - v.data.PushBack(buf) - } - - // Write up to length bytes. - sz := buf.WriteSize() - if int64(sz) > length-v.size { - sz = int(length - v.size) - } - - // Zero the written section; note that this pattern is - // specifically recognized and optimized by the compiler. - if zero { - for i := buf.write; i < buf.write+sz; i++ { - buf.data[i] = 0 - } - } - - // Advance the index. - buf.WriteMove(sz) - v.size += int64(sz) - } -} - -// Prepend prepends the given data. -func (v *View) Prepend(data []byte) { - // Is there any space in the first buffer? - if buf := v.data.Front(); buf != nil && buf.read > 0 { - // Fill up before the first write. - avail := buf.read - bStart := 0 - dStart := len(data) - avail - if avail > len(data) { - bStart = avail - len(data) - dStart = 0 - } - n := copy(buf.data[bStart:], data[dStart:]) - data = data[:dStart] - v.size += int64(n) - buf.read -= n - } - - for len(data) > 0 { - // Do we need an empty buffer? - buf := v.pool.get() - v.data.PushFront(buf) - - // The buffer is empty; copy last chunk. - avail := len(buf.data) - bStart := 0 - dStart := len(data) - avail - if avail > len(data) { - bStart = avail - len(data) - dStart = 0 - } - - // We have to put the data at the end of the current - // buffer in order to ensure that the next prepend will - // correctly fill up the beginning of this buffer. - n := copy(buf.data[bStart:], data[dStart:]) - data = data[:dStart] - v.size += int64(n) - buf.read = len(buf.data) - n - buf.write = len(buf.data) - } -} - -// Append appends the given data. -func (v *View) Append(data []byte) { - for done := 0; done < len(data); { - buf := v.data.Back() - - // Ensure there's a buffer with space. - if buf == nil || buf.Full() { - buf = v.pool.get() - v.data.PushBack(buf) - } - - // Copy in to the given buffer. - n := copy(buf.WriteSlice(), data[done:]) - done += n - buf.WriteMove(n) - v.size += int64(n) - } -} - -// Flatten returns a flattened copy of this data. -// -// This method should not be used in any performance-sensitive paths. It may -// allocate a fresh byte slice sufficiently large to contain all the data in -// the buffer. This is principally for debugging. -// -// N.B. Tee data still belongs to this view, as if there is a single buffer -// present, then it will be returned directly. This should be used for -// temporary use only, and a reference to the given slice should not be held. -func (v *View) Flatten() []byte { - if buf := v.data.Front(); buf == nil { - return nil // No data at all. - } else if buf.Next() == nil { - return buf.ReadSlice() // Only one buffer. - } - data := make([]byte, 0, v.size) // Need to flatten. - for buf := v.data.Front(); buf != nil; buf = buf.Next() { - // Copy to the allocated slice. - data = append(data, buf.ReadSlice()...) - } - return data -} - -// Size indicates the total amount of data available in this view. -func (v *View) Size() int64 { - return v.size -} - -// Copy makes a strict copy of this view. -func (v *View) Copy() (other View) { - for buf := v.data.Front(); buf != nil; buf = buf.Next() { - other.Append(buf.ReadSlice()) - } - return -} - -// Apply applies the given function across all valid data. -func (v *View) Apply(fn func([]byte)) { - for buf := v.data.Front(); buf != nil; buf = buf.Next() { - fn(buf.ReadSlice()) - } -} - -// Merge merges the provided View with this one. -// -// The other view will be appended to v, and other will be empty after this -// operation completes. -func (v *View) Merge(other *View) { - // Copy over all buffers. - for buf := other.data.Front(); buf != nil; buf = other.data.Front() { - other.data.Remove(buf) - v.data.PushBack(buf) - } - - // Adjust sizes. - v.size += other.size - other.size = 0 -} - -// WriteFromReader writes to the buffer from an io.Reader. -// -// A minimum read size equal to unsafe.Sizeof(unintptr) is enforced, -// provided that count is greater than or equal to unsafe.Sizeof(uintptr). -func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) { - var ( - done int64 - n int - err error - ) - for done < count { - buf := v.data.Back() - - // Ensure we have an empty buffer. - if buf == nil || buf.Full() { - buf = v.pool.get() - v.data.PushBack(buf) - } - - // Is this less than the minimum batch? - if buf.WriteSize() < minBatch && (count-done) >= int64(minBatch) { - tmp := make([]byte, minBatch) - n, err = r.Read(tmp) - v.Append(tmp[:n]) - done += int64(n) - if err != nil { - break - } - continue - } - - // Limit the read, if necessary. - sz := buf.WriteSize() - if left := count - done; int64(sz) > left { - sz = int(left) - } - - // Pass the relevant portion of the buffer. - n, err = r.Read(buf.WriteSlice()[:sz]) - buf.WriteMove(n) - done += int64(n) - v.size += int64(n) - if err == io.EOF { - err = nil // Short write allowed. - break - } else if err != nil { - break - } - } - return done, err -} - -// ReadToWriter reads from the buffer into an io.Writer. -// -// N.B. This does not consume the bytes read. TrimFront should -// be called appropriately after this call in order to do so. -// -// A minimum write size equal to unsafe.Sizeof(unintptr) is enforced, -// provided that count is greater than or equal to unsafe.Sizeof(uintptr). -func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) { - var ( - done int64 - n int - err error - ) - offset := 0 // Spill-over for batching. - for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() { - // Has this been consumed? Skip it. - sz := buf.ReadSize() - if sz <= offset { - offset -= sz - continue - } - sz -= offset - - // Is this less than the minimum batch? - left := count - done - if sz < minBatch && left >= int64(minBatch) && (v.size-done) >= int64(minBatch) { - tmp := make([]byte, minBatch) - n, err = v.ReadAt(tmp, done) - w.Write(tmp[:n]) - done += int64(n) - offset = n - sz // Reset below. - if err != nil { - break - } - continue - } - - // Limit the write if necessary. - if int64(sz) >= left { - sz = int(left) - } - - // Perform the actual write. - n, err = w.Write(buf.ReadSlice()[offset : offset+sz]) - done += int64(n) - if err != nil { - break - } - - // Reset spill-over. - offset = 0 - } - return done, err -} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index df27554d3..91d5dc174 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -407,33 +407,44 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + + // Order of checks is important. First check if parent directory can be + // executed, then check for existence, and lastly check if mount is writable. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST } - if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG - } if parent.isDeleted() { return syserror.ENOENT } + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), parent, name, &ds) + switch { + case err != nil && err != syserror.ENOENT: + return err + case child != nil: + return syserror.EEXIST + } + mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() - parent.dirMu.Lock() - defer parent.dirMu.Unlock() + + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return err + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } if parent.isSynthetic() { - if child := parent.children[name]; child != nil { - return syserror.EEXIST - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } if createInSyntheticDir == nil { return syserror.EPERM } @@ -449,47 +460,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } - if fs.opts.interop == InteropModeShared { - if child := parent.children[name]; child != nil && child.isSynthetic() { - return syserror.EEXIST - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } - // The existence of a non-synthetic dentry at name would be inconclusive - // because the file it represents may have been deleted from the remote - // filesystem, so we would need to make an RPC to revalidate the dentry. - // Just attempt the file creation RPC instead. If a file does exist, the - // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a - // stale dentry exists, the dentry will fail revalidation next time it's - // used. - if err := createInRemoteDir(parent, name, &ds); err != nil { - return err - } - ev := linux.IN_CREATE - if dir { - ev |= linux.IN_ISDIR - } - parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) - return nil - } - if child := parent.children[name]; child != nil { - return syserror.EEXIST - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } - // No cached dentry exists; however, there might still be an existing file - // at name. As above, we attempt the file creation RPC anyway. + // No cached dentry exists; however, in InteropModeShared there might still be + // an existing file at name. Just attempt the file creation RPC anyways. If a + // file does exist, the RPC will fail with EEXIST like we would have. if err := createInRemoteDir(parent, name, &ds); err != nil { return err } - if child, ok := parent.children[name]; ok && child == nil { - // Delete the now-stale negative dentry. - delete(parent.children, name) + if fs.opts.interop != InteropModeShared { + if child, ok := parent.children[name]; ok && child == nil { + // Delete the now-stale negative dentry. + delete(parent.children, name) + } + parent.touchCMtime() + parent.dirents = nil } - parent.touchCMtime() - parent.dirents = nil ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index e77523f22..a7a553619 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -208,7 +208,9 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // * Filesystem.mu must be locked for at least reading. // * isDir(parentInode) == true. func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error { - if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil { + // Order of checks is important. First check if parent directory can be + // executed, then check for existence, and lastly check if mount is writable. + if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil { return err } if name == "." || name == ".." { @@ -223,6 +225,9 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string if parent.VFSDentry().IsDead() { return syserror.ENOENT } + if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil { + return err + } return nil } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index d55bdc97f..e46f593c7 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -480,9 +480,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { - return err - } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST @@ -490,11 +487,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if parent.vfsd.IsDead() { return syserror.ENOENT } - mnt := rp.Mount() - if err := mnt.CheckBeginWrite(); err != nil { + + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } - defer mnt.EndWrite() + parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -514,6 +511,14 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return syserror.ENOENT } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } // Ensure that the parent directory is copied-up so that we can create the // new file in the upper layer. if err := parent.copyUpLocked(ctx); err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 9296db2fb..453e41d11 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -153,7 +153,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + + // Order of checks is important. First check if parent directory can be + // executed, then check for existence, and lastly check if mount is writable. + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() @@ -179,6 +182,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return err } defer mnt.EndWrite() + + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return err + } if err := create(parentDir, name); err != nil { return err } diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index b989e14c7..c551acd99 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -21,8 +21,8 @@ import ( "sync/atomic" "syscall" - "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -75,10 +75,18 @@ type Pipe struct { // mu protects all pipe internal state below. mu sync.Mutex `state:"nosave"` - // view is the underlying set of buffers. + // buf holds the pipe's data. buf is a circular buffer; the first valid + // byte in buf is at offset off, and the pipe contains size valid bytes. + // bufBlocks contains two identical safemem.Blocks representing buf; this + // avoids needing to heap-allocate a new safemem.Block slice when buf is + // resized. bufBlockSeq is a safemem.BlockSeq representing bufBlocks. // - // This is protected by mu. - view buffer.View + // These fields are protected by mu. + buf []byte + bufBlocks [2]safemem.Block `state:"nosave"` + bufBlockSeq safemem.BlockSeq `state:"nosave"` + off int64 + size int64 // max is the maximum size of the pipe in bytes. When this max has been // reached, writers will get EWOULDBLOCK. @@ -99,12 +107,6 @@ type Pipe struct { // // N.B. The size will be bounded. func NewPipe(isNamed bool, sizeBytes int64) *Pipe { - if sizeBytes < MinimumPipeSize { - sizeBytes = MinimumPipeSize - } - if sizeBytes > MaximumPipeSize { - sizeBytes = MaximumPipeSize - } var p Pipe initPipe(&p, isNamed, sizeBytes) return &p @@ -175,75 +177,71 @@ func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.F } } -type readOps struct { - // left returns the bytes remaining. - left func() int64 - - // limit limits subsequence reads. - limit func(int64) - - // read performs the actual read operation. - read func(*buffer.View) (int64, error) -} - -// read reads data from the pipe into dst and returns the number of bytes -// read, or returns ErrWouldBlock if the pipe is empty. +// peekLocked passes the first count bytes in the pipe to f and returns its +// result. If fewer than count bytes are available, the safemem.BlockSeq passed +// to f will be less than count bytes in length. // -// Precondition: this pipe must have readers. -func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { - p.mu.Lock() - defer p.mu.Unlock() - return p.readLocked(ctx, ops) -} - -func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { +// peekLocked does not mutate the pipe; if the read consumes bytes from the +// pipe, then the caller is responsible for calling p.consumeLocked() and +// p.Notify(waiter.EventOut). (The latter must be called with p.mu unlocked.) +// +// Preconditions: +// * p.mu must be locked. +// * This pipe must have readers. +func (p *Pipe) peekLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // Don't block for a zero-length read even if the pipe is empty. - if ops.left() == 0 { + if count == 0 { return 0, nil } - // Is the pipe empty? - if p.view.Size() == 0 { - if !p.HasWriters() { - // There are no writers, return EOF. - return 0, io.EOF + // Limit the amount of data read to the amount of data in the pipe. + if count > p.size { + if p.size == 0 { + if !p.HasWriters() { + return 0, io.EOF + } + return 0, syserror.ErrWouldBlock } - return 0, syserror.ErrWouldBlock + count = p.size } - // Limit how much we consume. - if ops.left() > p.view.Size() { - ops.limit(p.view.Size()) - } + // Prepare the view of the data to be read. + bs := p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(count)) - // Copy user data; the read op is responsible for trimming. - done, err := ops.read(&p.view) - return done, err + // Perform the read. + done, err := f(bs) + return int64(done), err } -type writeOps struct { - // left returns the bytes remaining. - left func() int64 - - // limit should limit subsequent writes. - limit func(int64) - - // write should write to the provided buffer. - write func(*buffer.View) (int64, error) -} - -// write writes data from sv into the pipe and returns the number of bytes -// written. If no bytes are written because the pipe is full (or has less than -// atomicIOBytes free capacity), write returns ErrWouldBlock. +// consumeLocked consumes the first n bytes in the pipe, such that they will no +// longer be visible to future reads. // -// Precondition: this pipe must have writers. -func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { - p.mu.Lock() - defer p.mu.Unlock() - return p.writeLocked(ctx, ops) +// Preconditions: +// * p.mu must be locked. +// * The pipe must contain at least n bytes. +func (p *Pipe) consumeLocked(n int64) { + p.off += n + if max := int64(len(p.buf)); p.off >= max { + p.off -= max + } + p.size -= n } -func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { +// writeLocked passes a safemem.BlockSeq representing the first count bytes of +// unused space in the pipe to f and returns the result. If fewer than count +// bytes are free, the safemem.BlockSeq passed to f will be less than count +// bytes in length. If the pipe is full or otherwise cannot accomodate a write +// of any number of bytes up to count, writeLocked returns ErrWouldBlock +// without calling f. +// +// Unlike peekLocked, writeLocked assumes that f returns the number of bytes +// written to the pipe, and increases the number of bytes stored in the pipe +// accordingly. Callers are still responsible for calling +// p.Notify(waiter.EventIn) with p.mu unlocked. +// +// Preconditions: +// * p.mu must be locked. +func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // Can't write to a pipe with no readers. if !p.HasReaders() { return 0, syscall.EPIPE @@ -251,29 +249,59 @@ func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. - wanted := ops.left() - avail := p.max - p.view.Size() - if wanted > avail { - if wanted <= atomicIOBytes { + avail := p.max - p.size + short := false + if count > avail { + if count <= atomicIOBytes { return 0, syserror.ErrWouldBlock } - ops.limit(avail) + count = avail + short = true } - // Copy user data. - done, err := ops.write(&p.view) - if err != nil { - return done, err + // Ensure that the buffer is big enough. + if newLen, oldCap := p.size+count, int64(len(p.buf)); newLen > oldCap { + // Allocate a new buffer. + newCap := oldCap * 2 + if oldCap == 0 { + newCap = 8 // arbitrary; sending individual integers across pipes is relatively common + } + for newLen > newCap { + newCap *= 2 + } + if newCap > p.max { + newCap = p.max + } + newBuf := make([]byte, newCap) + // Copy the old buffer's contents to the beginning of the new one. + safemem.CopySeq( + safemem.BlockSeqOf(safemem.BlockFromSafeSlice(newBuf)), + p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(p.size))) + // Switch to the new buffer. + p.buf = newBuf + p.bufBlocks[0] = safemem.BlockFromSafeSlice(newBuf) + p.bufBlocks[1] = p.bufBlocks[0] + p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:]) + p.off = 0 } - if done < avail { - // Non-failure, but short write. - return done, nil + // Prepare the view of the space to be written. + woff := p.off + p.size + if woff >= int64(len(p.buf)) { + woff -= int64(len(p.buf)) } - if done < wanted { - // Partial write due to full pipe. Note that this could also be - // the short write case above, we would expect a second call - // and the write to return zero bytes in this case. + bs := p.bufBlockSeq.DropFirst64(uint64(woff)).TakeFirst64(uint64(count)) + + // Perform the write. + doneU64, err := f(bs) + done := int64(doneU64) + p.size += done + if done < count || err != nil { + return done, err + } + + // If we shortened the write, adjust the returned error appropriately. + if short { return done, syserror.ErrWouldBlock } @@ -324,7 +352,7 @@ func (p *Pipe) HasWriters() bool { // Precondition: mu must be held. func (p *Pipe) rReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasReaders() && p.view.Size() != 0 { + if p.HasReaders() && p.size != 0 { ready |= waiter.EventIn } if !p.HasWriters() && p.hadWriter { @@ -350,7 +378,7 @@ func (p *Pipe) rReadiness() waiter.EventMask { // Precondition: mu must be held. func (p *Pipe) wReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasWriters() && p.view.Size() < p.max { + if p.HasWriters() && p.size < p.max { ready |= waiter.EventOut } if !p.HasReaders() { @@ -383,7 +411,7 @@ func (p *Pipe) queued() int64 { } func (p *Pipe) queuedLocked() int64 { - return p.view.Size() + return p.size } // FifoSize implements fs.FifoSizer.FifoSize. @@ -406,7 +434,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) { } p.mu.Lock() defer p.mu.Unlock() - if size < p.view.Size() { + if size < p.size { return 0, syserror.EBUSY } p.max = size diff --git a/pkg/sentry/kernel/pipe/pipe_state_autogen.go b/pkg/sentry/kernel/pipe/pipe_state_autogen.go index 3413c8bbb..9cee1f13a 100644 --- a/pkg/sentry/kernel/pipe/pipe_state_autogen.go +++ b/pkg/sentry/kernel/pipe/pipe_state_autogen.go @@ -41,7 +41,9 @@ func (p *Pipe) StateFields() []string { "isNamed", "readers", "writers", - "view", + "buf", + "off", + "size", "max", "hadWriter", } @@ -54,20 +56,23 @@ func (p *Pipe) StateSave(stateSinkObject state.Sink) { stateSinkObject.Save(0, &p.isNamed) stateSinkObject.Save(1, &p.readers) stateSinkObject.Save(2, &p.writers) - stateSinkObject.Save(3, &p.view) - stateSinkObject.Save(4, &p.max) - stateSinkObject.Save(5, &p.hadWriter) + stateSinkObject.Save(3, &p.buf) + stateSinkObject.Save(4, &p.off) + stateSinkObject.Save(5, &p.size) + stateSinkObject.Save(6, &p.max) + stateSinkObject.Save(7, &p.hadWriter) } -func (p *Pipe) afterLoad() {} - func (p *Pipe) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(0, &p.isNamed) stateSourceObject.Load(1, &p.readers) stateSourceObject.Load(2, &p.writers) - stateSourceObject.Load(3, &p.view) - stateSourceObject.Load(4, &p.max) - stateSourceObject.Load(5, &p.hadWriter) + stateSourceObject.Load(3, &p.buf) + stateSourceObject.Load(4, &p.off) + stateSourceObject.Load(5, &p.size) + stateSourceObject.Load(6, &p.max) + stateSourceObject.Load(7, &p.hadWriter) + stateSourceObject.AfterLoad(p.afterLoad) } func (r *Reader) StateTypeName() string { diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index f665920cb..77246edbe 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -21,9 +21,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" - "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -44,46 +44,37 @@ func (p *Pipe) Release(context.Context) { // Read reads from the Pipe into dst. func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { - n, err := p.read(ctx, readOps{ - left: func() int64 { - return dst.NumBytes() - }, - limit: func(l int64) { - dst = dst.TakeFirst64(l) - }, - read: func(view *buffer.View) (int64, error) { - n, err := dst.CopyOutFrom(ctx, view) - dst = dst.DropFirst64(n) - view.TrimFront(n) - return n, err - }, - }) + n, err := dst.CopyOutFrom(ctx, p) if n > 0 { p.Notify(waiter.EventOut) } return n, err } +// ReadToBlocks implements safemem.Reader.ReadToBlocks for Pipe.Read. +func (p *Pipe) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + n, err := p.read(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, srcs) + }, true /* removeFromSrc */) + return uint64(n), err +} + +func (p *Pipe) read(count int64, f func(srcs safemem.BlockSeq) (uint64, error), removeFromSrc bool) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + n, err := p.peekLocked(count, f) + if n > 0 && removeFromSrc { + p.consumeLocked(n) + } + return n, err +} + // WriteTo writes to w from the Pipe. func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { - ops := readOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - read: func(view *buffer.View) (int64, error) { - n, err := view.ReadToWriter(w, count) - if !dup { - view.TrimFront(n) - } - count -= n - return n, err - }, - } - n, err := p.read(ctx, ops) - if n > 0 { + n, err := p.read(count, func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.FromIOWriter{w}.WriteFromBlocks(srcs) + }, !dup /* removeFromSrc */) + if n > 0 && !dup { p.Notify(waiter.EventOut) } return n, err @@ -91,39 +82,31 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) // Write writes to the Pipe from src. func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { - n, err := p.write(ctx, writeOps{ - left: func() int64 { - return src.NumBytes() - }, - limit: func(l int64) { - src = src.TakeFirst64(l) - }, - write: func(view *buffer.View) (int64, error) { - n, err := src.CopyInTo(ctx, view) - src = src.DropFirst64(n) - return n, err - }, - }) + n, err := src.CopyInTo(ctx, p) if n > 0 { p.Notify(waiter.EventIn) } return n, err } +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks for Pipe.Write. +func (p *Pipe) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + n, err := p.write(int64(srcs.NumBytes()), func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, srcs) + }) + return uint64(n), err +} + +func (p *Pipe) write(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + return p.writeLocked(count, f) +} + // ReadFrom reads from r to the Pipe. func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { - n, err := p.write(ctx, writeOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - write: func(view *buffer.View) (int64, error) { - n, err := view.WriteFromReader(r, count) - count -= n - return n, err - }, + n, err := p.write(count, func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.FromIOReader{r}.ReadToBlocks(dsts) }) if n > 0 { p.Notify(waiter.EventIn) diff --git a/pkg/buffer/view_unsafe.go b/pkg/sentry/kernel/pipe/save_restore.go index d1ef39b26..f135827de 100644 --- a/pkg/buffer/view_unsafe.go +++ b/pkg/sentry/kernel/pipe/save_restore.go @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -package buffer +package pipe import ( - "unsafe" + "gvisor.dev/gvisor/pkg/safemem" ) -// minBatch is the smallest Read or Write operation that the -// WriteFromReader and ReadToWriter functions will use. -// -// This is defined as the size of a native pointer. -const minBatch = int(unsafe.Sizeof(uintptr(0))) +// afterLoad is called by stateify. +func (p *Pipe) afterLoad() { + p.bufBlocks[0] = safemem.BlockFromSafeSlice(p.buf) + p.bufBlocks[1] = p.bufBlocks[0] + p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:]) +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 2d47d2e82..d5a91730d 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -16,7 +16,6 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -269,12 +268,10 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { // SpliceToNonPipe performs a splice operation from fd to a non-pipe file. func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) { fd.pipe.mu.Lock() - defer fd.pipe.mu.Unlock() // Cap the sequence at number of bytes actually available. - v := fd.pipe.queuedLocked() - if v < count { - count = v + if count > fd.pipe.size { + count = fd.pipe.size } src := usermem.IOSequence{ IO: fd, @@ -291,154 +288,97 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{}) } if n > 0 { - fd.pipe.view.TrimFront(n) + fd.pipe.consumeLocked(n) + } + + fd.pipe.mu.Unlock() + + if n > 0 { + fd.pipe.Notify(waiter.EventOut) } return n, err } // SpliceFromNonPipe performs a splice operation from a non-pipe file to fd. func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) { - fd.pipe.mu.Lock() - defer fd.pipe.mu.Unlock() - dst := usermem.IOSequence{ IO: fd, Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), } + var ( + n int64 + err error + ) + fd.pipe.mu.Lock() if off == -1 { - return in.Read(ctx, dst, vfs.ReadOptions{}) + n, err = in.Read(ctx, dst, vfs.ReadOptions{}) + } else { + n, err = in.PRead(ctx, dst, off, vfs.ReadOptions{}) + } + fd.pipe.mu.Unlock() + + if n > 0 { + fd.pipe.Notify(waiter.EventIn) } - return in.PRead(ctx, dst, off, vfs.ReadOptions{}) + return n, err } // CopyIn implements usermem.IO.CopyIn. Note that it is the caller's -// responsibility to trim fd.pipe.view after the read is completed. +// responsibility to call fd.pipe.consumeLocked() and +// fd.pipe.Notify(waiter.EventOut) after the read is completed. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { - origCount := int64(len(dst)) - n, err := fd.pipe.readLocked(ctx, readOps{ - left: func() int64 { - return int64(len(dst)) - }, - limit: func(l int64) { - dst = dst[:l] - }, - read: func(view *buffer.View) (int64, error) { - n, err := view.ReadAt(dst, 0) - return int64(n), err - }, + n, err := fd.pipe.peekLocked(int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs) }) - if n > 0 { - fd.pipe.Notify(waiter.EventOut) - } - if err == nil && n != origCount { - return int(n), syserror.ErrWouldBlock - } return int(n), err } -// CopyOut implements usermem.IO.CopyOut. +// CopyOut implements usermem.IO.CopyOut. Note that it is the caller's +// responsibility to call fd.pipe.Notify(waiter.EventIn) after the +// write is completed. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { - origCount := int64(len(src)) - n, err := fd.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return int64(len(src)) - }, - limit: func(l int64) { - src = src[:l] - }, - write: func(view *buffer.View) (int64, error) { - view.Append(src) - return int64(len(src)), nil - }, + n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) }) - if n > 0 { - fd.pipe.Notify(waiter.EventIn) - } - if err == nil && n != origCount { - return int(n), syserror.ErrWouldBlock - } return int(n), err } // ZeroOut implements usermem.IO.ZeroOut. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { - origCount := toZero - n, err := fd.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return toZero - }, - limit: func(l int64) { - toZero = l - }, - write: func(view *buffer.View) (int64, error) { - view.Grow(view.Size()+toZero, true /* zero */) - return toZero, nil - }, + n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.ZeroSeq(dsts) }) - if n > 0 { - fd.pipe.Notify(waiter.EventIn) - } - if err == nil && n != origCount { - return n, syserror.ErrWouldBlock - } return n, err } // CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's -// responsibility to trim fd.pipe.view after the read is completed. +// responsibility to call fd.pipe.consumeLocked() and +// fd.pipe.Notify(waiter.EventOut) after the read is completed. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { - count := ars.NumBytes() - if count == 0 { - return 0, nil - } - origCount := count - n, err := fd.pipe.readLocked(ctx, readOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - read: func(view *buffer.View) (int64, error) { - n, err := view.ReadToSafememWriter(dst, uint64(count)) - return int64(n), err - }, + return fd.pipe.peekLocked(ars.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) { + return dst.WriteFromBlocks(srcs) }) - if n > 0 { - fd.pipe.Notify(waiter.EventOut) - } - if err == nil && n != origCount { - return n, syserror.ErrWouldBlock - } - return n, err } // CopyOutFrom implements usermem.IO.CopyOutFrom. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { - count := ars.NumBytes() - if count == 0 { - return 0, nil - } - origCount := count - n, err := fd.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - write: func(view *buffer.View) (int64, error) { - n, err := view.WriteFromSafememReader(src, uint64(count)) - return int64(n), err - }, + n, err := fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) { + return src.ReadToBlocks(dsts) }) if n > 0 { fd.pipe.Notify(waiter.EventIn) } - if err == nil && n != origCount { - return n, syserror.ErrWouldBlock - } return n, err } @@ -481,37 +421,23 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } lockTwoPipes(dst.pipe, src.pipe) - defer dst.pipe.mu.Unlock() - defer src.pipe.mu.Unlock() - - n, err := dst.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - write: func(dstView *buffer.View) (int64, error) { - return src.pipe.readLocked(ctx, readOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - read: func(srcView *buffer.View) (int64, error) { - n, err := srcView.ReadToSafememWriter(dstView, uint64(count)) - if n > 0 && removeFromSrc { - srcView.TrimFront(int64(n)) - } - return int64(n), err - }, - }) - }, + n, err := dst.pipe.writeLocked(count, func(dsts safemem.BlockSeq) (uint64, error) { + n, err := src.pipe.peekLocked(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, srcs) + }) + if n > 0 && removeFromSrc { + src.pipe.consumeLocked(n) + } + return uint64(n), err }) + dst.pipe.mu.Unlock() + src.pipe.mu.Unlock() + if n > 0 { dst.pipe.Notify(waiter.EventIn) - src.pipe.Notify(waiter.EventOut) + if removeFromSrc { + src.pipe.Notify(waiter.EventOut) + } } return n, err } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 94fb425b2..03749a8bf 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -186,6 +186,21 @@ var Metrics = tcpip.Stats{ IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."), IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."), }, + ARP: tcpip.ARPStats{ + PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."), + DisabledPacketsReceived: mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."), + MalformedPacketsReceived: mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."), + RequestsReceived: mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."), + RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), + OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), + OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), + OutgoingRequestNetworkUnreachableErrors: mustCreateMetric("/netstack/arp/outgoing_requests_network_unreachable", "Number of failed attempts to send an ARP request with a network unreachable error."), + OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), + OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), + RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), + OutgoingRepliesDropped: mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."), + OutgoingRepliesSent: mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."), + }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 1c4cdb0dd..134051124 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -29,24 +29,23 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) { return 0, syserror.EINVAL } - + if opts.Length == 0 { + return 0, nil + } if opts.Length > int64(kernel.MAX_RW_COUNT) { opts.Length = int64(kernel.MAX_RW_COUNT) } var ( - total int64 n int64 err error inCh chan struct{} outCh chan struct{} ) - for opts.Length > 0 { + for { n, err = fs.Splice(t, outFile, inFile, opts) - opts.Length -= n - total += n - if err != syserror.ErrWouldBlock { + if n != 0 || err != syserror.ErrWouldBlock { break } else if err == syserror.ErrWouldBlock && nonBlocking { break @@ -87,13 +86,13 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB } } - if total > 0 { + if n > 0 { // On Linux, inotify behavior is not very consistent with splice(2). We try // our best to emulate Linux for very basic calls to splice, where for some // reason, events are generated for output files, but not input files. outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0) } - return total, err + return n, err } // Sendfile implements linux system call sendfile(2). diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 3d5c0d270..3259d052f 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -119,21 +119,28 @@ func (*endpoint) WriteHeaderIncludedPacket(*stack.Route, *stack.PacketBuffer) *t } func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { + stats := e.protocol.stack.Stats().ARP + stats.PacketsReceived.Increment() + if !e.isEnabled() { + stats.DisabledPacketsReceived.Increment() return } h := header.ARP(pkt.NetworkHeader().View()) if !h.IsValid() { + stats.MalformedPacketsReceived.Increment() return } switch h.Op() { case header.ARPRequest: + stats.RequestsReceived.Increment() localAddr := tcpip.Address(h.ProtocolAddressTarget()) if e.nud == nil { if e.linkAddrCache.CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 { + stats.RequestsReceivedUnknownTargetAddress.Increment() return // we have no useful answer, ignore the request } @@ -142,6 +149,7 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { e.linkAddrCache.AddLinkAddress(e.nic.ID(), addr, linkAddr) } else { if e.protocol.stack.CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 { + stats.RequestsReceivedUnknownTargetAddress.Increment() return // we have no useful answer, ignore the request } @@ -177,9 +185,14 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { // // Send the packet to the (new) target hardware address on the same // hardware on which the request was received. - _ = e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), nil /* gso */, ProtocolNumber, respPkt) + if err := e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), nil /* gso */, ProtocolNumber, respPkt); err != nil { + stats.OutgoingRepliesDropped.Increment() + } else { + stats.OutgoingRepliesSent.Increment() + } case header.ARPReply: + stats.RepliesReceived.Increment() addr := tcpip.Address(h.ProtocolAddressSender()) linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) @@ -233,6 +246,8 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest. func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, nic stack.NetworkInterface) *tcpip.Error { + stats := p.stack.Stats().ARP + if len(remoteLinkAddr) == 0 { remoteLinkAddr = header.EthernetBroadcastAddress } @@ -241,15 +256,18 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot if len(localAddr) == 0 { addr, err := p.stack.GetMainNICAddress(nicID, header.IPv4ProtocolNumber) if err != nil { + stats.OutgoingRequestInterfaceHasNoLocalAddressErrors.Increment() return err } if len(addr.Address) == 0 { + stats.OutgoingRequestNetworkUnreachableErrors.Increment() return tcpip.ErrNetworkUnreachable } localAddr = addr.Address } else if p.stack.CheckLocalAddress(nicID, header.IPv4ProtocolNumber, localAddr) == 0 { + stats.OutgoingRequestBadLocalAddressErrors.Increment() return tcpip.ErrBadLocalAddress } @@ -269,7 +287,12 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot if n := copy(h.ProtocolAddressTarget(), targetAddr); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } - return nic.WritePacketToRemote(remoteLinkAddr, nil /* gso */, ProtocolNumber, pkt) + if err := nic.WritePacketToRemote(remoteLinkAddr, nil /* gso */, ProtocolNumber, pkt); err != nil { + stats.OutgoingRequestsDropped.Increment() + return err + } + stats.OutgoingRequestsSent.Increment() + return nil } // ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 002ddaf67..49d4912ad 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1591,6 +1591,59 @@ type IPStats struct { OptionUnknownReceived *StatCounter } +// ARPStats collects ARP-specific stats. +type ARPStats struct { + // PacketsReceived is the number of ARP packets received from the link layer. + PacketsReceived *StatCounter + + // DisabledPacketsReceived is the number of ARP packets received from the link + // layer when the ARP layer is disabled. + DisabledPacketsReceived *StatCounter + + // MalformedPacketsReceived is the number of ARP packets that were dropped due + // to being malformed. + MalformedPacketsReceived *StatCounter + + // RequestsReceived is the number of ARP requests received. + RequestsReceived *StatCounter + + // RequestsReceivedUnknownTargetAddress is the number of ARP requests that + // were targeted to an interface different from the one it was received on. + RequestsReceivedUnknownTargetAddress *StatCounter + + // OutgoingRequestInterfaceHasNoLocalAddressErrors is the number of failures + // to send an ARP request because the interface has no network address + // assigned to it. + OutgoingRequestInterfaceHasNoLocalAddressErrors *StatCounter + + // OutgoingRequestBadLocalAddressErrors is the number of failures to send an + // ARP request with a bad local address. + OutgoingRequestBadLocalAddressErrors *StatCounter + + // OutgoingRequestNetworkUnreachableErrors is the number of failures to send + // an ARP request with a network unreachable error. + OutgoingRequestNetworkUnreachableErrors *StatCounter + + // OutgoingRequestsDropped is the number of ARP requests which failed to write + // to a link-layer endpoint. + OutgoingRequestsDropped *StatCounter + + // OutgoingRequestSent is the number of ARP requests successfully written to a + // link-layer endpoint. + OutgoingRequestsSent *StatCounter + + // RepliesReceived is the number of ARP replies received. + RepliesReceived *StatCounter + + // OutgoingRepliesDropped is the number of ARP replies which failed to write + // to a link-layer endpoint. + OutgoingRepliesDropped *StatCounter + + // OutgoingRepliesSent is the number of ARP replies successfully written to a + // link-layer endpoint. + OutgoingRepliesSent *StatCounter +} + // TCPStats collects TCP-specific stats. type TCPStats struct { // ActiveConnectionOpenings is the number of connections opened @@ -1743,6 +1796,9 @@ type Stats struct { // IP breaks out IP-specific stats (both v4 and v6). IP IPStats + // ARP breaks out ARP-specific stats. + ARP ARPStats + // TCP breaks out TCP-specific stats. TCP TCPStats diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 9e8872fc9..6921de0f1 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -305,10 +305,7 @@ func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, q // Initialize and start the handshake. h := ep.newPassiveHandshake(isn, irs, opts, deferAccept) - if err := h.start(); err != nil { - l.cleanupFailedHandshake(h) - return nil, err - } + h.start() return h, nil } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index f45d26a87..6cdbb8bee 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -53,7 +53,6 @@ const ( wakerForNotification = iota wakerForNewSegment wakerForResend - wakerForResolution ) const ( @@ -460,9 +459,9 @@ func (h *handshake) processSegments() *tcpip.Error { return nil } -// start resolves the route if necessary and sends the first -// SYN/SYN-ACK. -func (h *handshake) start() *tcpip.Error { +// start sends the first SYN/SYN-ACK. It does not block, even if link address +// resolution is required. +func (h *handshake) start() { h.startTime = time.Now() h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) var sackEnabled tcpip.TCPSACKEnabled @@ -503,7 +502,6 @@ func (h *handshake) start() *tcpip.Error { ack: h.ackNum, rcvWnd: h.rcvWnd, }, synOpts) - return nil } // complete completes the TCP 3-way handshake initiated by h.start(). diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index ddbed7e46..a4508e871 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2325,68 +2325,17 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc } if run { - if err := e.startMainLoop(handshake); err != nil { - return err - } - } - - return tcpip.ErrConnectStarted -} - -// startMainLoop sends the initial SYN and starts the main loop for the -// endpoint. -func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error { - preloop := func() *tcpip.Error { if handshake { h := e.newHandshake() e.setEndpointState(StateSynSent) - if err := h.start(); err != nil { - e.lastErrorMu.Lock() - e.lastError = err - e.lastErrorMu.Unlock() - - e.setEndpointState(StateError) - e.hardError = err - - // Call cleanupLocked to free up any reservations. - e.cleanupLocked() - return err - } + h.start() } e.stack.Stats().TCP.ActiveConnectionOpenings.Increment() - return nil - } - - if e.route.IsResolutionRequired() { - // If the endpoint is closed between releasing e.mu and the goroutine below - // acquiring it, make sure that cleanup is deferred to the new goroutine. e.workerRunning = true - - // Sending the initial SYN may block due to route resolution; do it in a - // separate goroutine to avoid blocking the syscall goroutine. - go func() { // S/R-SAFE: will be drained before save. - e.mu.Lock() - if err := preloop(); err != nil { - e.workerRunning = false - e.mu.Unlock() - return - } - e.mu.Unlock() - _ = e.protocolMainLoop(handshake, nil) - }() - return nil + go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save. } - // No route resolution is required, so we can send the initial SYN here without - // blocking. This will hopefully reduce overall latency by overlapping time - // spent waiting for a SYN-ACK and time spent spinning up a new goroutine - // for the main loop. - if err := preloop(); err != nil { - return err - } - e.workerRunning = true - go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save. - return nil + return tcpip.ErrConnectStarted } // ConnectEndpoint is not supported. |