diff options
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/fsimpl/gofer/filesystem.go | 82 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/filesystem.go | 7 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/overlay/filesystem.go | 17 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/filesystem.go | 9 | ||||
-rw-r--r-- | pkg/sentry/kernel/pipe/BUILD | 2 | ||||
-rw-r--r-- | pkg/sentry/kernel/pipe/pipe.go | 198 | ||||
-rw-r--r-- | pkg/sentry/kernel/pipe/pipe_util.go | 99 | ||||
-rw-r--r-- | pkg/sentry/kernel/pipe/save_restore.go | 26 | ||||
-rw-r--r-- | pkg/sentry/kernel/pipe/vfs.go | 202 | ||||
-rw-r--r-- | pkg/sentry/socket/netstack/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/socket/netstack/netstack.go | 50 | ||||
-rw-r--r-- | pkg/sentry/socket/netstack/netstack_vfs2.go | 11 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_splice.go | 15 |
13 files changed, 330 insertions, 389 deletions
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index df27554d3..91d5dc174 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -407,33 +407,44 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + + // Order of checks is important. First check if parent directory can be + // executed, then check for existence, and lastly check if mount is writable. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST } - if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG - } if parent.isDeleted() { return syserror.ENOENT } + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), parent, name, &ds) + switch { + case err != nil && err != syserror.ENOENT: + return err + case child != nil: + return syserror.EEXIST + } + mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() - parent.dirMu.Lock() - defer parent.dirMu.Unlock() + + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return err + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } if parent.isSynthetic() { - if child := parent.children[name]; child != nil { - return syserror.EEXIST - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } if createInSyntheticDir == nil { return syserror.EPERM } @@ -449,47 +460,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } - if fs.opts.interop == InteropModeShared { - if child := parent.children[name]; child != nil && child.isSynthetic() { - return syserror.EEXIST - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } - // The existence of a non-synthetic dentry at name would be inconclusive - // because the file it represents may have been deleted from the remote - // filesystem, so we would need to make an RPC to revalidate the dentry. - // Just attempt the file creation RPC instead. If a file does exist, the - // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a - // stale dentry exists, the dentry will fail revalidation next time it's - // used. - if err := createInRemoteDir(parent, name, &ds); err != nil { - return err - } - ev := linux.IN_CREATE - if dir { - ev |= linux.IN_ISDIR - } - parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) - return nil - } - if child := parent.children[name]; child != nil { - return syserror.EEXIST - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } - // No cached dentry exists; however, there might still be an existing file - // at name. As above, we attempt the file creation RPC anyway. + // No cached dentry exists; however, in InteropModeShared there might still be + // an existing file at name. Just attempt the file creation RPC anyways. If a + // file does exist, the RPC will fail with EEXIST like we would have. if err := createInRemoteDir(parent, name, &ds); err != nil { return err } - if child, ok := parent.children[name]; ok && child == nil { - // Delete the now-stale negative dentry. - delete(parent.children, name) + if fs.opts.interop != InteropModeShared { + if child, ok := parent.children[name]; ok && child == nil { + // Delete the now-stale negative dentry. + delete(parent.children, name) + } + parent.touchCMtime() + parent.dirents = nil } - parent.touchCMtime() - parent.dirents = nil ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index e77523f22..a7a553619 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -208,7 +208,9 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // * Filesystem.mu must be locked for at least reading. // * isDir(parentInode) == true. func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error { - if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil { + // Order of checks is important. First check if parent directory can be + // executed, then check for existence, and lastly check if mount is writable. + if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil { return err } if name == "." || name == ".." { @@ -223,6 +225,9 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string if parent.VFSDentry().IsDead() { return syserror.ENOENT } + if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil { + return err + } return nil } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index d55bdc97f..e46f593c7 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -480,9 +480,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { - return err - } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST @@ -490,11 +487,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if parent.vfsd.IsDead() { return syserror.ENOENT } - mnt := rp.Mount() - if err := mnt.CheckBeginWrite(); err != nil { + + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } - defer mnt.EndWrite() + parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -514,6 +511,14 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return syserror.ENOENT } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } // Ensure that the parent directory is copied-up so that we can create the // new file in the upper layer. if err := parent.copyUpLocked(ctx); err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 9296db2fb..453e41d11 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -153,7 +153,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + + // Order of checks is important. First check if parent directory can be + // executed, then check for existence, and lastly check if mount is writable. + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() @@ -179,6 +182,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return err } defer mnt.EndWrite() + + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return err + } if err := create(parentDir, name); err != nil { return err } diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 99134e634..2c32d017d 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -12,6 +12,7 @@ go_library( "pipe_util.go", "reader.go", "reader_writer.go", + "save_restore.go", "vfs.go", "writer.go", ], @@ -19,7 +20,6 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", - "//pkg/buffer", "//pkg/context", "//pkg/marshal/primitive", "//pkg/safemem", diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index b989e14c7..c551acd99 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -21,8 +21,8 @@ import ( "sync/atomic" "syscall" - "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -75,10 +75,18 @@ type Pipe struct { // mu protects all pipe internal state below. mu sync.Mutex `state:"nosave"` - // view is the underlying set of buffers. + // buf holds the pipe's data. buf is a circular buffer; the first valid + // byte in buf is at offset off, and the pipe contains size valid bytes. + // bufBlocks contains two identical safemem.Blocks representing buf; this + // avoids needing to heap-allocate a new safemem.Block slice when buf is + // resized. bufBlockSeq is a safemem.BlockSeq representing bufBlocks. // - // This is protected by mu. - view buffer.View + // These fields are protected by mu. + buf []byte + bufBlocks [2]safemem.Block `state:"nosave"` + bufBlockSeq safemem.BlockSeq `state:"nosave"` + off int64 + size int64 // max is the maximum size of the pipe in bytes. When this max has been // reached, writers will get EWOULDBLOCK. @@ -99,12 +107,6 @@ type Pipe struct { // // N.B. The size will be bounded. func NewPipe(isNamed bool, sizeBytes int64) *Pipe { - if sizeBytes < MinimumPipeSize { - sizeBytes = MinimumPipeSize - } - if sizeBytes > MaximumPipeSize { - sizeBytes = MaximumPipeSize - } var p Pipe initPipe(&p, isNamed, sizeBytes) return &p @@ -175,75 +177,71 @@ func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.F } } -type readOps struct { - // left returns the bytes remaining. - left func() int64 - - // limit limits subsequence reads. - limit func(int64) - - // read performs the actual read operation. - read func(*buffer.View) (int64, error) -} - -// read reads data from the pipe into dst and returns the number of bytes -// read, or returns ErrWouldBlock if the pipe is empty. +// peekLocked passes the first count bytes in the pipe to f and returns its +// result. If fewer than count bytes are available, the safemem.BlockSeq passed +// to f will be less than count bytes in length. // -// Precondition: this pipe must have readers. -func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { - p.mu.Lock() - defer p.mu.Unlock() - return p.readLocked(ctx, ops) -} - -func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { +// peekLocked does not mutate the pipe; if the read consumes bytes from the +// pipe, then the caller is responsible for calling p.consumeLocked() and +// p.Notify(waiter.EventOut). (The latter must be called with p.mu unlocked.) +// +// Preconditions: +// * p.mu must be locked. +// * This pipe must have readers. +func (p *Pipe) peekLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // Don't block for a zero-length read even if the pipe is empty. - if ops.left() == 0 { + if count == 0 { return 0, nil } - // Is the pipe empty? - if p.view.Size() == 0 { - if !p.HasWriters() { - // There are no writers, return EOF. - return 0, io.EOF + // Limit the amount of data read to the amount of data in the pipe. + if count > p.size { + if p.size == 0 { + if !p.HasWriters() { + return 0, io.EOF + } + return 0, syserror.ErrWouldBlock } - return 0, syserror.ErrWouldBlock + count = p.size } - // Limit how much we consume. - if ops.left() > p.view.Size() { - ops.limit(p.view.Size()) - } + // Prepare the view of the data to be read. + bs := p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(count)) - // Copy user data; the read op is responsible for trimming. - done, err := ops.read(&p.view) - return done, err + // Perform the read. + done, err := f(bs) + return int64(done), err } -type writeOps struct { - // left returns the bytes remaining. - left func() int64 - - // limit should limit subsequent writes. - limit func(int64) - - // write should write to the provided buffer. - write func(*buffer.View) (int64, error) -} - -// write writes data from sv into the pipe and returns the number of bytes -// written. If no bytes are written because the pipe is full (or has less than -// atomicIOBytes free capacity), write returns ErrWouldBlock. +// consumeLocked consumes the first n bytes in the pipe, such that they will no +// longer be visible to future reads. // -// Precondition: this pipe must have writers. -func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { - p.mu.Lock() - defer p.mu.Unlock() - return p.writeLocked(ctx, ops) +// Preconditions: +// * p.mu must be locked. +// * The pipe must contain at least n bytes. +func (p *Pipe) consumeLocked(n int64) { + p.off += n + if max := int64(len(p.buf)); p.off >= max { + p.off -= max + } + p.size -= n } -func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { +// writeLocked passes a safemem.BlockSeq representing the first count bytes of +// unused space in the pipe to f and returns the result. If fewer than count +// bytes are free, the safemem.BlockSeq passed to f will be less than count +// bytes in length. If the pipe is full or otherwise cannot accomodate a write +// of any number of bytes up to count, writeLocked returns ErrWouldBlock +// without calling f. +// +// Unlike peekLocked, writeLocked assumes that f returns the number of bytes +// written to the pipe, and increases the number of bytes stored in the pipe +// accordingly. Callers are still responsible for calling +// p.Notify(waiter.EventIn) with p.mu unlocked. +// +// Preconditions: +// * p.mu must be locked. +func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // Can't write to a pipe with no readers. if !p.HasReaders() { return 0, syscall.EPIPE @@ -251,29 +249,59 @@ func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. - wanted := ops.left() - avail := p.max - p.view.Size() - if wanted > avail { - if wanted <= atomicIOBytes { + avail := p.max - p.size + short := false + if count > avail { + if count <= atomicIOBytes { return 0, syserror.ErrWouldBlock } - ops.limit(avail) + count = avail + short = true } - // Copy user data. - done, err := ops.write(&p.view) - if err != nil { - return done, err + // Ensure that the buffer is big enough. + if newLen, oldCap := p.size+count, int64(len(p.buf)); newLen > oldCap { + // Allocate a new buffer. + newCap := oldCap * 2 + if oldCap == 0 { + newCap = 8 // arbitrary; sending individual integers across pipes is relatively common + } + for newLen > newCap { + newCap *= 2 + } + if newCap > p.max { + newCap = p.max + } + newBuf := make([]byte, newCap) + // Copy the old buffer's contents to the beginning of the new one. + safemem.CopySeq( + safemem.BlockSeqOf(safemem.BlockFromSafeSlice(newBuf)), + p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(p.size))) + // Switch to the new buffer. + p.buf = newBuf + p.bufBlocks[0] = safemem.BlockFromSafeSlice(newBuf) + p.bufBlocks[1] = p.bufBlocks[0] + p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:]) + p.off = 0 } - if done < avail { - // Non-failure, but short write. - return done, nil + // Prepare the view of the space to be written. + woff := p.off + p.size + if woff >= int64(len(p.buf)) { + woff -= int64(len(p.buf)) } - if done < wanted { - // Partial write due to full pipe. Note that this could also be - // the short write case above, we would expect a second call - // and the write to return zero bytes in this case. + bs := p.bufBlockSeq.DropFirst64(uint64(woff)).TakeFirst64(uint64(count)) + + // Perform the write. + doneU64, err := f(bs) + done := int64(doneU64) + p.size += done + if done < count || err != nil { + return done, err + } + + // If we shortened the write, adjust the returned error appropriately. + if short { return done, syserror.ErrWouldBlock } @@ -324,7 +352,7 @@ func (p *Pipe) HasWriters() bool { // Precondition: mu must be held. func (p *Pipe) rReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasReaders() && p.view.Size() != 0 { + if p.HasReaders() && p.size != 0 { ready |= waiter.EventIn } if !p.HasWriters() && p.hadWriter { @@ -350,7 +378,7 @@ func (p *Pipe) rReadiness() waiter.EventMask { // Precondition: mu must be held. func (p *Pipe) wReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasWriters() && p.view.Size() < p.max { + if p.HasWriters() && p.size < p.max { ready |= waiter.EventOut } if !p.HasReaders() { @@ -383,7 +411,7 @@ func (p *Pipe) queued() int64 { } func (p *Pipe) queuedLocked() int64 { - return p.view.Size() + return p.size } // FifoSize implements fs.FifoSizer.FifoSize. @@ -406,7 +434,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) { } p.mu.Lock() defer p.mu.Unlock() - if size < p.view.Size() { + if size < p.size { return 0, syserror.EBUSY } p.max = size diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index f665920cb..77246edbe 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -21,9 +21,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" - "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -44,46 +44,37 @@ func (p *Pipe) Release(context.Context) { // Read reads from the Pipe into dst. func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { - n, err := p.read(ctx, readOps{ - left: func() int64 { - return dst.NumBytes() - }, - limit: func(l int64) { - dst = dst.TakeFirst64(l) - }, - read: func(view *buffer.View) (int64, error) { - n, err := dst.CopyOutFrom(ctx, view) - dst = dst.DropFirst64(n) - view.TrimFront(n) - return n, err - }, - }) + n, err := dst.CopyOutFrom(ctx, p) if n > 0 { p.Notify(waiter.EventOut) } return n, err } +// ReadToBlocks implements safemem.Reader.ReadToBlocks for Pipe.Read. +func (p *Pipe) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + n, err := p.read(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, srcs) + }, true /* removeFromSrc */) + return uint64(n), err +} + +func (p *Pipe) read(count int64, f func(srcs safemem.BlockSeq) (uint64, error), removeFromSrc bool) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + n, err := p.peekLocked(count, f) + if n > 0 && removeFromSrc { + p.consumeLocked(n) + } + return n, err +} + // WriteTo writes to w from the Pipe. func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { - ops := readOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - read: func(view *buffer.View) (int64, error) { - n, err := view.ReadToWriter(w, count) - if !dup { - view.TrimFront(n) - } - count -= n - return n, err - }, - } - n, err := p.read(ctx, ops) - if n > 0 { + n, err := p.read(count, func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.FromIOWriter{w}.WriteFromBlocks(srcs) + }, !dup /* removeFromSrc */) + if n > 0 && !dup { p.Notify(waiter.EventOut) } return n, err @@ -91,39 +82,31 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) // Write writes to the Pipe from src. func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { - n, err := p.write(ctx, writeOps{ - left: func() int64 { - return src.NumBytes() - }, - limit: func(l int64) { - src = src.TakeFirst64(l) - }, - write: func(view *buffer.View) (int64, error) { - n, err := src.CopyInTo(ctx, view) - src = src.DropFirst64(n) - return n, err - }, - }) + n, err := src.CopyInTo(ctx, p) if n > 0 { p.Notify(waiter.EventIn) } return n, err } +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks for Pipe.Write. +func (p *Pipe) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + n, err := p.write(int64(srcs.NumBytes()), func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, srcs) + }) + return uint64(n), err +} + +func (p *Pipe) write(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + return p.writeLocked(count, f) +} + // ReadFrom reads from r to the Pipe. func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { - n, err := p.write(ctx, writeOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - write: func(view *buffer.View) (int64, error) { - n, err := view.WriteFromReader(r, count) - count -= n - return n, err - }, + n, err := p.write(count, func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.FromIOReader{r}.ReadToBlocks(dsts) }) if n > 0 { p.Notify(waiter.EventIn) diff --git a/pkg/sentry/kernel/pipe/save_restore.go b/pkg/sentry/kernel/pipe/save_restore.go new file mode 100644 index 000000000..f135827de --- /dev/null +++ b/pkg/sentry/kernel/pipe/save_restore.go @@ -0,0 +1,26 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/safemem" +) + +// afterLoad is called by stateify. +func (p *Pipe) afterLoad() { + p.bufBlocks[0] = safemem.BlockFromSafeSlice(p.buf) + p.bufBlocks[1] = p.bufBlocks[0] + p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:]) +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 2d47d2e82..d5a91730d 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -16,7 +16,6 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -269,12 +268,10 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { // SpliceToNonPipe performs a splice operation from fd to a non-pipe file. func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) { fd.pipe.mu.Lock() - defer fd.pipe.mu.Unlock() // Cap the sequence at number of bytes actually available. - v := fd.pipe.queuedLocked() - if v < count { - count = v + if count > fd.pipe.size { + count = fd.pipe.size } src := usermem.IOSequence{ IO: fd, @@ -291,154 +288,97 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{}) } if n > 0 { - fd.pipe.view.TrimFront(n) + fd.pipe.consumeLocked(n) + } + + fd.pipe.mu.Unlock() + + if n > 0 { + fd.pipe.Notify(waiter.EventOut) } return n, err } // SpliceFromNonPipe performs a splice operation from a non-pipe file to fd. func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) { - fd.pipe.mu.Lock() - defer fd.pipe.mu.Unlock() - dst := usermem.IOSequence{ IO: fd, Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), } + var ( + n int64 + err error + ) + fd.pipe.mu.Lock() if off == -1 { - return in.Read(ctx, dst, vfs.ReadOptions{}) + n, err = in.Read(ctx, dst, vfs.ReadOptions{}) + } else { + n, err = in.PRead(ctx, dst, off, vfs.ReadOptions{}) + } + fd.pipe.mu.Unlock() + + if n > 0 { + fd.pipe.Notify(waiter.EventIn) } - return in.PRead(ctx, dst, off, vfs.ReadOptions{}) + return n, err } // CopyIn implements usermem.IO.CopyIn. Note that it is the caller's -// responsibility to trim fd.pipe.view after the read is completed. +// responsibility to call fd.pipe.consumeLocked() and +// fd.pipe.Notify(waiter.EventOut) after the read is completed. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { - origCount := int64(len(dst)) - n, err := fd.pipe.readLocked(ctx, readOps{ - left: func() int64 { - return int64(len(dst)) - }, - limit: func(l int64) { - dst = dst[:l] - }, - read: func(view *buffer.View) (int64, error) { - n, err := view.ReadAt(dst, 0) - return int64(n), err - }, + n, err := fd.pipe.peekLocked(int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs) }) - if n > 0 { - fd.pipe.Notify(waiter.EventOut) - } - if err == nil && n != origCount { - return int(n), syserror.ErrWouldBlock - } return int(n), err } -// CopyOut implements usermem.IO.CopyOut. +// CopyOut implements usermem.IO.CopyOut. Note that it is the caller's +// responsibility to call fd.pipe.Notify(waiter.EventIn) after the +// write is completed. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { - origCount := int64(len(src)) - n, err := fd.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return int64(len(src)) - }, - limit: func(l int64) { - src = src[:l] - }, - write: func(view *buffer.View) (int64, error) { - view.Append(src) - return int64(len(src)), nil - }, + n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) }) - if n > 0 { - fd.pipe.Notify(waiter.EventIn) - } - if err == nil && n != origCount { - return int(n), syserror.ErrWouldBlock - } return int(n), err } // ZeroOut implements usermem.IO.ZeroOut. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { - origCount := toZero - n, err := fd.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return toZero - }, - limit: func(l int64) { - toZero = l - }, - write: func(view *buffer.View) (int64, error) { - view.Grow(view.Size()+toZero, true /* zero */) - return toZero, nil - }, + n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) { + return safemem.ZeroSeq(dsts) }) - if n > 0 { - fd.pipe.Notify(waiter.EventIn) - } - if err == nil && n != origCount { - return n, syserror.ErrWouldBlock - } return n, err } // CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's -// responsibility to trim fd.pipe.view after the read is completed. +// responsibility to call fd.pipe.consumeLocked() and +// fd.pipe.Notify(waiter.EventOut) after the read is completed. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { - count := ars.NumBytes() - if count == 0 { - return 0, nil - } - origCount := count - n, err := fd.pipe.readLocked(ctx, readOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - read: func(view *buffer.View) (int64, error) { - n, err := view.ReadToSafememWriter(dst, uint64(count)) - return int64(n), err - }, + return fd.pipe.peekLocked(ars.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) { + return dst.WriteFromBlocks(srcs) }) - if n > 0 { - fd.pipe.Notify(waiter.EventOut) - } - if err == nil && n != origCount { - return n, syserror.ErrWouldBlock - } - return n, err } // CopyOutFrom implements usermem.IO.CopyOutFrom. +// +// Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { - count := ars.NumBytes() - if count == 0 { - return 0, nil - } - origCount := count - n, err := fd.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - write: func(view *buffer.View) (int64, error) { - n, err := view.WriteFromSafememReader(src, uint64(count)) - return int64(n), err - }, + n, err := fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) { + return src.ReadToBlocks(dsts) }) if n > 0 { fd.pipe.Notify(waiter.EventIn) } - if err == nil && n != origCount { - return n, syserror.ErrWouldBlock - } return n, err } @@ -481,37 +421,23 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } lockTwoPipes(dst.pipe, src.pipe) - defer dst.pipe.mu.Unlock() - defer src.pipe.mu.Unlock() - - n, err := dst.pipe.writeLocked(ctx, writeOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - write: func(dstView *buffer.View) (int64, error) { - return src.pipe.readLocked(ctx, readOps{ - left: func() int64 { - return count - }, - limit: func(l int64) { - count = l - }, - read: func(srcView *buffer.View) (int64, error) { - n, err := srcView.ReadToSafememWriter(dstView, uint64(count)) - if n > 0 && removeFromSrc { - srcView.TrimFront(int64(n)) - } - return int64(n), err - }, - }) - }, + n, err := dst.pipe.writeLocked(count, func(dsts safemem.BlockSeq) (uint64, error) { + n, err := src.pipe.peekLocked(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) { + return safemem.CopySeq(dsts, srcs) + }) + if n > 0 && removeFromSrc { + src.pipe.consumeLocked(n) + } + return uint64(n), err }) + dst.pipe.mu.Unlock() + src.pipe.mu.Unlock() + if n > 0 { dst.pipe.Notify(waiter.EventIn) - src.pipe.Notify(waiter.EventOut) + if removeFromSrc { + src.pipe.Notify(waiter.EventOut) + } } return n, err } diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index b2206900b..22abca120 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -18,7 +18,6 @@ go_library( ], deps = [ "//pkg/abi/linux", - "//pkg/amutex", "//pkg/binary", "//pkg/context", "//pkg/log", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 57f224120..03749a8bf 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -36,7 +36,6 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -187,6 +186,21 @@ var Metrics = tcpip.Stats{ IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."), IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."), }, + ARP: tcpip.ARPStats{ + PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."), + DisabledPacketsReceived: mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."), + MalformedPacketsReceived: mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."), + RequestsReceived: mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."), + RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), + OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), + OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), + OutgoingRequestNetworkUnreachableErrors: mustCreateMetric("/netstack/arp/outgoing_requests_network_unreachable", "Number of failed attempts to send an ARP request with a network unreachable error."), + OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), + OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), + RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), + OutgoingRepliesDropped: mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."), + OutgoingRepliesSent: mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."), + }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), @@ -459,18 +473,10 @@ func (i *ioSequencePayload) DropFirst(n int) { // Write implements fs.FileOperations.Write. func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { f := &ioSequencePayload{ctx: ctx, src: src} - n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) if err == tcpip.ErrWouldBlock { return 0, syserror.ErrWouldBlock } - - if resCh != nil { - if err := amutex.Block(ctx, resCh); err != nil { - return 0, err - } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) - } - if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } @@ -526,24 +532,12 @@ func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) { // ReadFrom implements fs.FileOperations.ReadFrom. func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { f := &readerPayload{ctx: ctx, r: r, count: count} - n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{ + n, err := s.Endpoint.Write(f, tcpip.WriteOptions{ // Reads may be destructive but should be very fast, // so we can't release the lock while copying data. Atomic: true, }) if err == tcpip.ErrWouldBlock { - return 0, syserror.ErrWouldBlock - } - - if resCh != nil { - if err := amutex.Block(ctx, resCh); err != nil { - return 0, err - } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{ - Atomic: true, // See above. - }) - } - if err == tcpip.ErrWouldBlock { return n, syserror.ErrWouldBlock } else if err != nil { return int64(n), f.err // Propagate error. @@ -2836,13 +2830,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b } v := &ioSequencePayload{t, src} - n, resCh, err := s.Endpoint.Write(v, opts) - if resCh != nil { - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err) - } - n, _, err = s.Endpoint.Write(v, opts) - } + n, err := s.Endpoint.Write(v, opts) dontWait := flags&linux.MSG_DONTWAIT != 0 if err == nil && (n >= v.src.NumBytes() || dontWait) { // Complete write. @@ -2861,7 +2849,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b v.DropFirst(int(n)) total := n for { - n, _, err = s.Endpoint.Write(v, opts) + n, err = s.Endpoint.Write(v, opts) v.DropFirst(int(n)) total += n diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index b756bfca0..6f70b02fc 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -16,7 +16,6 @@ package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -131,18 +130,10 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs } f := &ioSequencePayload{ctx: ctx, src: src} - n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) if err == tcpip.ErrWouldBlock { return 0, syserror.ErrWouldBlock } - - if resCh != nil { - if err := amutex.Block(ctx, resCh); err != nil { - return 0, err - } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) - } - if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 1c4cdb0dd..134051124 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -29,24 +29,23 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) { return 0, syserror.EINVAL } - + if opts.Length == 0 { + return 0, nil + } if opts.Length > int64(kernel.MAX_RW_COUNT) { opts.Length = int64(kernel.MAX_RW_COUNT) } var ( - total int64 n int64 err error inCh chan struct{} outCh chan struct{} ) - for opts.Length > 0 { + for { n, err = fs.Splice(t, outFile, inFile, opts) - opts.Length -= n - total += n - if err != syserror.ErrWouldBlock { + if n != 0 || err != syserror.ErrWouldBlock { break } else if err == syserror.ErrWouldBlock && nonBlocking { break @@ -87,13 +86,13 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB } } - if total > 0 { + if n > 0 { // On Linux, inotify behavior is not very consistent with splice(2). We try // our best to emulate Linux for very basic calls to splice, where for some // reason, events are generated for output files, but not input files. outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0) } - return total, err + return n, err } // Sendfile implements linux system call sendfile(2). |