From ae6f092fe117a738df34e072ef5ba01a41c89222 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 10 Aug 2018 16:09:52 -0700 Subject: Implemented the splice(2) syscall. Currently the implementation matches the behavior of moving data between two file descriptors. However, it does not implement this through zero-copy movement. Thus, this code is a starting point to build the more complex implementation. PiperOrigin-RevId: 208284483 Change-Id: Ibde79520a3d50bc26aead7ad4f128d2be31db14e --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/splice.go | 20 +++ pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_splice.go | 260 ++++++++++++++++++++++++++++++++ 5 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 pkg/abi/linux/splice.go create mode 100644 pkg/sentry/syscalls/linux/sys_splice.go diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index ac4ceefbc..9a44c2042 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -43,6 +43,7 @@ go_library( "shm.go", "signal.go", "socket.go", + "splice.go", "time.go", "tty.go", "uio.go", diff --git a/pkg/abi/linux/splice.go b/pkg/abi/linux/splice.go new file mode 100644 index 000000000..9331ec84b --- /dev/null +++ b/pkg/abi/linux/splice.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Flags for splice(2). +const ( + SPLICE_F_NONBLOCK = 2 +) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index bbdfad9da..62423c0fa 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -34,6 +34,7 @@ go_library( "sys_shm.go", "sys_signal.go", "sys_socket.go", + "sys_splice.go", "sys_stat.go", "sys_sync.go", "sys_sysinfo.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index c102af101..485c96202 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -319,7 +319,7 @@ var AMD64 = &kernel.SyscallTable{ 272: Unshare, 273: syscalls.Error(syscall.ENOSYS), // SetRobustList, obsolete 274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete - // 275: Splice, TODO + 275: Splice, // 276: Tee, TODO // 277: SyncFileRange, TODO // 278: Vmsplice, TODO diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go new file mode 100644 index 000000000..8151e3599 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -0,0 +1,260 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Splice implements linux syscall splice(2). +func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fdIn := kdefs.FD(args[0].Int()) + offIn := args[1].Pointer() + fdOut := kdefs.FD(args[2].Int()) + offOut := args[3].Pointer() + size := int64(args[4].SizeT()) + flags := uint(args[5].Uint()) + + fileIn := t.FDMap().GetFile(fdIn) + if fileIn == nil { + return 0, nil, syserror.EBADF + } + defer fileIn.DecRef() + fileOut := t.FDMap().GetFile(fdOut) + if fileOut == nil { + return 0, nil, syserror.EBADF + } + defer fileOut.DecRef() + + // Check for whether we have pipes. + ipipe := fs.IsPipe(fileIn.Dirent.Inode.StableAttr) + opipe := fs.IsPipe(fileOut.Dirent.Inode.StableAttr) + if (ipipe && offIn != 0) || (opipe && offOut != 0) { + return 0, nil, syserror.ESPIPE + } + + // Check if both file descriptors are pipes. + if ipipe && opipe { + var readPipe *pipe.Pipe + switch p := fileIn.FileOperations.(type) { + case *pipe.Reader: + readPipe = p.ReaderWriter.Pipe + case *pipe.ReaderWriter: + readPipe = p.Pipe + default: + return 0, nil, syserror.EBADF + } + var writePipe *pipe.Pipe + switch p := fileOut.FileOperations.(type) { + case *pipe.Writer: + writePipe = p.ReaderWriter.Pipe + case *pipe.ReaderWriter: + writePipe = p.Pipe + default: + return 0, nil, syserror.EBADF + } + + // Splicing with two ends of the same pipe is not allowed. + if readPipe == writePipe { + return 0, nil, syserror.EINVAL + } + spliced, err := splicePipeToPipe(t, fileIn, fileOut, size, flags) + if err != nil { + return 0, nil, err + } + return uintptr(spliced), nil, nil + } + + // Check if the file descriptor that contains the data to move is a pipe. + if ipipe { + flagsOut := fileOut.Flags() + offset := uint64(fileOut.Offset()) + + // If there is an offset for the file, ensure the file has the Pwrite flag. + if offOut != 0 { + if !flagsOut.Pwrite { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(offOut, &offset); err != nil { + return 0, nil, err + } + } + + if !flagsOut.Write { + return 0, nil, syserror.EBADF + } + + if flagsOut.Append { + return 0, nil, syserror.EINVAL + } + + switch fileIn.FileOperations.(type) { + case *pipe.Reader, *pipe.ReaderWriter: + // If the pipe in is a Reader or ReaderWriter, we can continue. + default: + return 0, nil, syserror.EBADF + } + spliced, err := spliceWrite(t, fileIn, fileOut, size, offset, flags) + if err != nil { + return 0, nil, err + } + + // Make sure value that offset points to is updated. + if offOut == 0 { + fileOut.Seek(t, fs.SeekSet, spliced+int64(offset)) + } else if _, err := t.CopyOut(offOut, spliced+int64(offset)); err != nil { + return 0, nil, err + } + return uintptr(spliced), nil, nil + } + + // Check if the file descriptor that the data will be moved to is a pipe. + if opipe { + flagsIn := fileIn.Flags() + offset := uint64(fileIn.Offset()) + + // If there is an offset for the file, ensure the file has the Pread flag. + if offIn != 0 { + if !flagsIn.Pread { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(offIn, &offset); err != nil { + return 0, nil, err + } + } + + if !flagsIn.Read { + return 0, nil, syserror.EBADF + } + + switch fileOut.FileOperations.(type) { + case *pipe.Writer, *pipe.ReaderWriter: + // If the pipe out is a Writer or ReaderWriter, we can continue. + default: + return 0, nil, syserror.EBADF + } + spliced, err := spliceRead(t, fileIn, fileOut, size, offset, flags) + if err != nil { + return 0, nil, err + } + + // Make sure value that offset points to is updated. + if offIn == 0 { + fileOut.Seek(t, fs.SeekSet, spliced+int64(offset)) + } else if _, err := t.CopyOut(offIn, spliced+int64(offset)); err != nil { + return 0, nil, err + } + return uintptr(spliced), nil, nil + } + + // Splice requires one of the file descriptors to be a pipe. + return 0, nil, syserror.EINVAL +} + +// splicePipeToPipe moves data from one pipe to another pipe. +// TODO: Implement with zero copy movement/without copying between +// user and kernel address spaces. +func splicePipeToPipe(t *kernel.Task, inPipe *fs.File, outPipe *fs.File, size int64, flags uint) (int64, error) { + w := &fs.FileWriter{t, outPipe} + if flags == linux.SPLICE_F_NONBLOCK { + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + return io.Copy(w, r) + } + var n int64 + for read := int64(0); read < size; { + var err error + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + n, err = io.Copy(w, r) + if err != nil && err != syserror.ErrWouldBlock { + return 0, err + } + read += n + } + return n, nil +} + +// spliceRead moves data from a file to a pipe. +// TODO: Implement with zero copy movement/without copying between +// user and kernel address spaces. +func spliceRead(t *kernel.Task, inFile *fs.File, outPipe *fs.File, size int64, offset uint64, flags uint) (int64, error) { + w := &fs.FileWriter{t, outPipe} + if flags == linux.SPLICE_F_NONBLOCK { + r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), size) + return io.Copy(w, r) + } + var n int64 + for read := int64(0); read < size; { + r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), size) + var err error + n, err = io.Copy(w, r) + if err != nil && err != syserror.ErrWouldBlock { + return 0, err + } + read += n + } + return n, nil +} + +// offsetWriter implements io.Writer on a section of an underlying +// WriterAt starting from the offset and ending at the limit. +type offsetWriter struct { + w io.WriterAt + off int64 + limit int64 +} + +// Write implements io.Writer.Write and writes the content of the offsetWriter +// starting at the offset and ending at the limit into the given buffer. +func (o *offsetWriter) Write(p []byte) (n int, err error) { + if o.off >= o.limit { + return 0, io.EOF + } + if max := o.limit - o.off; int64(len(p)) > max { + p = p[0:max] + } + n, err = o.w.WriteAt(p, o.off) + o.off += int64(n) + return n, err +} + +// spliceWrite moves data from a pipe to a file. +// TODO: Implement with zero copy movement/without copying between +// user and kernel address spaces. +func spliceWrite(t *kernel.Task, inPipe *fs.File, outFile *fs.File, size int64, offset uint64, flags uint) (int64, error) { + w := &offsetWriter{&fs.FileWriter{t, outFile}, int64(offset), int64(offset) + size} + if flags == linux.SPLICE_F_NONBLOCK { + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + return io.Copy(w, r) + } + var n int64 + for read := int64(0); read < size; { + var err error + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + n, err = io.Copy(w, r) + if err != nil && err != syserror.ErrWouldBlock { + return 0, err + } + read += n + } + return n, nil +} -- cgit v1.2.3