Diffstat (limited to 'pkg/sentry/fs/splice.go')
-rw-r--r-- | pkg/sentry/fs/splice.go | 181 |
1 files changed, 181 insertions, 0 deletions
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
new file mode 100644
index 000000000..33da82868
--- /dev/null
+++ b/pkg/sentry/fs/splice.go
@@ -0,0 +1,181 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs
+
+import (
+        "io"
+        "sync/atomic"
+
+        "gvisor.dev/gvisor/pkg/context"
+        "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Splice moves data to dst, directly from src.
+//
+// File offsets are updated only if DstOffset and SrcOffset are not set.
+func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, error) {
+        // Verify basic file flag permissions.
+        if !dst.Flags().Write || !src.Flags().Read {
+                return 0, syserror.EBADF
+        }
+
+        // Check whether or not the objects being spliced are stream-oriented
+        // (i.e. pipes or sockets). For all stream-oriented files and files
+        // where a specific offset is not requested, we acquire the file mutex.
+        // This has two important side effects. First, it provides the standard
+        // protection against concurrent writes that would mutate the offset.
+        // Second, it prevents Splice deadlocks. Only internal anonymous files
+        // implement the ReadFrom and WriteTo methods directly, and since such
+        // anonymous files are referred to by a unique fs.File object, we know
+        // that the file mutex takes strict precedence over internal locks.
+        // Since we enforce lock ordering here, we can't deadlock by using
+        // a file in two different splice operations simultaneously.
+        srcPipe := !IsRegular(src.Dirent.Inode.StableAttr)
+        dstPipe := !IsRegular(dst.Dirent.Inode.StableAttr)
+        dstAppend := !dstPipe && dst.Flags().Append
+        srcLock := srcPipe || !opts.SrcOffset
+        dstLock := dstPipe || !opts.DstOffset || dstAppend
+
+        switch {
+        case srcLock && dstLock:
+                switch {
+                case dst.UniqueID < src.UniqueID:
+                        // Acquire dst first.
+                        if !dst.mu.Lock(ctx) {
+                                return 0, syserror.ErrInterrupted
+                        }
+                        if !src.mu.Lock(ctx) {
+                                dst.mu.Unlock()
+                                return 0, syserror.ErrInterrupted
+                        }
+                case dst.UniqueID > src.UniqueID:
+                        // Acquire src first.
+                        if !src.mu.Lock(ctx) {
+                                return 0, syserror.ErrInterrupted
+                        }
+                        if !dst.mu.Lock(ctx) {
+                                src.mu.Unlock()
+                                return 0, syserror.ErrInterrupted
+                        }
+                case dst.UniqueID == src.UniqueID:
+                        // Acquire only one lock; it's the same file. This is a
+                        // bit of an edge case, but presumably it's possible.
+                        if !dst.mu.Lock(ctx) {
+                                return 0, syserror.ErrInterrupted
+                        }
+                        srcLock = false // Only need one unlock.
+                }
+                // Use both offsets (locked).
+                opts.DstStart = dst.offset
+                opts.SrcStart = src.offset
+        case dstLock:
+                // Acquire only dst.
+                if !dst.mu.Lock(ctx) {
+                        return 0, syserror.ErrInterrupted
+                }
+                opts.DstStart = dst.offset // Safe: locked.
+        case srcLock:
+                // Acquire only src.
+                if !src.mu.Lock(ctx) {
+                        return 0, syserror.ErrInterrupted
+                }
+                opts.SrcStart = src.offset // Safe: locked.
+        }
+
+        var err error
+        if dstAppend {
+                unlock := dst.Dirent.Inode.lockAppendMu(dst.Flags().Append)
+                defer unlock()
+
+                // Figure out the appropriate offset to use.
+                err = dst.offsetForAppend(ctx, &opts.DstStart)
+        }
+        if err == nil && !dstPipe {
+                // Enforce file limits.
+                limit, ok := dst.checkLimit(ctx, opts.DstStart)
+                switch {
+                case ok && limit == 0:
+                        err = syserror.ErrExceedsFileSizeLimit
+                case ok && limit < opts.Length:
+                        opts.Length = limit // Cap the write.
+                }
+        }
+        if err != nil {
+                if dstLock {
+                        dst.mu.Unlock()
+                }
+                if srcLock {
+                        src.mu.Unlock()
+                }
+                return 0, err
+        }
+
+        // Construct readers and writers for the splice. This is used to
+        // provide a safer locking path for the WriteTo/ReadFrom operations
+        // (since they would otherwise go through public interface methods that
+        // conflict with the locking done above), and simplifies the fallback path.
+        w := &lockedWriter{
+                Ctx:    ctx,
+                File:   dst,
+                Offset: opts.DstStart,
+        }
+        r := &lockedReader{
+                Ctx:    ctx,
+                File:   src,
+                Offset: opts.SrcStart,
+        }
+
+        // Attempt to do a WriteTo; this is likely the most efficient.
+        n, err := src.FileOperations.WriteTo(ctx, src, w, opts.Length, opts.Dup)
+        if n == 0 && err == syserror.ENOSYS && !opts.Dup {
+                // Attempt as a ReadFrom. Like a WriteTo, a ReadFrom may also be
+                // more efficient than a copy if buffers are cached or readily
+                // available. (It's unlikely that they can actually be donated).
+                n, err = dst.FileOperations.ReadFrom(ctx, dst, r, opts.Length)
+        }
+
+        // Support one last fallback option, but only if at least one of
+        // the source and destination is a regular file. This is because
+        // if we block at some point, we could lose data. If the source is
+        // not a pipe then reading is not destructive; if the destination
+        // is a regular file, then it is guaranteed not to block writing.
+        if n == 0 && err == syserror.ENOSYS && !opts.Dup && (!dstPipe || !srcPipe) {
+                // Fall back to an in-kernel copy.
+                n, err = io.Copy(w, &io.LimitedReader{
+                        R: r,
+                        N: opts.Length,
+                })
+        }
+
+        // Update offsets, if required.
+        if n > 0 {
+                if !dstPipe && !opts.DstOffset {
+                        atomic.StoreInt64(&dst.offset, dst.offset+n)
+                }
+                if !srcPipe && !opts.SrcOffset {
+                        atomic.StoreInt64(&src.offset, src.offset+n)
+                }
+        }
+
+        // Drop locks.
+        if dstLock {
+                dst.mu.Unlock()
+        }
+        if srcLock {
+                src.mu.Unlock()
+        }
+
+        return n, err
+}
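
Note on the UniqueID-ordered locking above: acquiring the two file mutexes lowest-ID-first is a standard deadlock-avoidance pattern, so two concurrent splices over the same pair of files (in either direction) cannot deadlock. A minimal standalone sketch of the same idea, using a plain sync.Mutex and a made-up resource type rather than gVisor's interruptible file mutex (illustration only, not part of this change):

package lockorder

import "sync"

// resource is a stand-in for fs.File: a lockable object with a stable,
// unique ID. Hypothetical type, for illustration only.
type resource struct {
        id uint64
        mu sync.Mutex
}

// lockPair acquires both locks in a globally consistent order (lowest ID
// first), so concurrent calls with the arguments swapped cannot deadlock.
// If both arguments are the same object, it is locked only once, mirroring
// the dst.UniqueID == src.UniqueID case in the diff above.
func lockPair(a, b *resource) (unlock func()) {
        if a == b {
                a.mu.Lock()
                return func() { a.mu.Unlock() }
        }
        if a.id > b.id {
                a, b = b, a
        }
        a.mu.Lock()
        b.mu.Lock()
        return func() {
                b.mu.Unlock()
                a.mu.Unlock()
        }
}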
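
For orientation, a hedged sketch of how a caller (for example, a sendfile-style syscall path) might drive Splice with an explicit destination offset. The doSendfile helper and its parameters are hypothetical, and the SpliceOpts field semantics are assumed from the diff above (DstOffset/SrcOffset select an explicit start offset and suppress the corresponding file-offset update); this is not the actual syscall implementation:

package sketch

import (
        "gvisor.dev/gvisor/pkg/context"
        "gvisor.dev/gvisor/pkg/sentry/fs"
)

// doSendfile is a hypothetical helper: splice up to count bytes from src
// into dst at the explicit offset dstOff, leaving dst's own file offset
// untouched while advancing src's.
func doSendfile(ctx context.Context, dst, src *fs.File, dstOff, count int64) (int64, error) {
        return fs.Splice(ctx, dst, src, fs.SpliceOpts{
                Length:    count,
                DstOffset: true,   // use DstStart instead of dst's offset; dst.offset is not advanced
                DstStart:  dstOff,
                // SrcOffset is left false: read from src's current offset,
                // which is advanced by the number of bytes spliced.
        })
}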