diff options
Diffstat (limited to 'pkg/sentry/syscalls/linux')
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_aio.go | 169 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/BUILD | 3 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/aio.go | 216 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/vfs2.go | 6 |
4 files changed, 279 insertions, 115 deletions
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index d781d6a04..ba2557c52 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -15,8 +15,8 @@ package linux import ( - "encoding/binary" - + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -27,59 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// I/O commands. -const ( - _IOCB_CMD_PREAD = 0 - _IOCB_CMD_PWRITE = 1 - _IOCB_CMD_FSYNC = 2 - _IOCB_CMD_FDSYNC = 3 - _IOCB_CMD_NOOP = 6 - _IOCB_CMD_PREADV = 7 - _IOCB_CMD_PWRITEV = 8 -) - -// I/O flags. -const ( - _IOCB_FLAG_RESFD = 1 -) - -// ioCallback describes an I/O request. -// -// The priority field is currently ignored in the implementation below. Also -// note that the IOCB_FLAG_RESFD feature is not supported. -type ioCallback struct { - Data uint64 - Key uint32 - Reserved1 uint32 - - OpCode uint16 - ReqPrio int16 - FD int32 - - Buf uint64 - Bytes uint64 - Offset int64 - - Reserved2 uint64 - Flags uint32 - - // eventfd to signal if IOCB_FLAG_RESFD is set in flags. - ResFD int32 -} - -// ioEvent describes an I/O result. -// -// +stateify savable -type ioEvent struct { - Data uint64 - Obj uint64 - Result int64 - Result2 int64 -} - -// ioEventSize is the size of an ioEvent encoded. -var ioEventSize = binary.Size(ioEvent{}) - // IoSetup implements linux syscall io_setup(2). func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nrEvents := args[0].Int() @@ -192,7 +139,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } } - ev := v.(*ioEvent) + ev := v.(*linux.IOEvent) // Copy out the result. if _, err := t.CopyOut(eventsAddr, ev); err != nil { @@ -204,7 +151,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } // Keep rolling. - eventsAddr += usermem.Addr(ioEventSize) + eventsAddr += usermem.Addr(linux.IOEventSize) } // Everything finished. @@ -231,7 +178,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl } // memoryFor returns appropriate memory for the given callback. -func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. @@ -242,17 +189,17 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { // we have no guarantee that t's AddressSpace will be active during the // I/O. switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP: + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: return usermem.IOSequence{}, nil default: @@ -261,54 +208,62 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { } } -func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) { - if ctx.Dead() { - ctx.CancelPendingRequest() - return - } - ev := &ioEvent{ - Data: cb.Data, - Obj: uint64(cbAddr), - } +// IoCancel implements linux syscall io_cancel(2). +// +// It is not presently supported (ENOSYS indicates no support on this +// architecture). +func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} - // Construct a context.Context that will not be interrupted if t is - // interrupted. - c := t.AsyncContext() +// LINT.IfChange - var err error - switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV: - ev.Result, err = file.Preadv(c, ioseq, cb.Offset) - case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: - ev.Result, err = file.Pwritev(c, ioseq, cb.Offset) - case _IOCB_CMD_FSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll) - case _IOCB_CMD_FDSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData) - } +func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback { + return func(ctx context.Context) { + if actx.Dead() { + actx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } - // Update the result. - if err != nil { - err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) - ev.Result = -int64(kernel.ExtractErrno(err, 0)) - } + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_FSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + case linux.IOCB_CMD_FDSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData) + } + + // Update the result. + if err != nil { + err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } - file.DecRef() + file.DecRef() - // Queue the result for delivery. - ctx.FinishRequest(ev) + // Queue the result for delivery. + actx.FinishRequest(ev) - // Notify the event file if one was specified. This needs to happen - // *after* queueing the result to avoid racing with the thread we may - // wake up. - if eventFile != nil { - eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) - eventFile.DecRef() + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFile != nil { + eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) + eventFile.DecRef() + } } } // submitCallback processes a single callback. -func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { file := t.GetFile(cb.FD) if file == nil { // File not found. @@ -318,7 +273,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Was there an eventFD? Extract it. var eventFile *fs.File - if cb.Flags&_IOCB_FLAG_RESFD != 0 { + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. @@ -340,7 +295,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Check offset for reads/writes. switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { return syserror.EINVAL } @@ -366,7 +321,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Perform the request asynchronously. file.IncRef() - fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) }) + t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile)) // All set. return nil @@ -395,7 +350,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Copy in this callback. - var cb ioCallback + var cb linux.IOCallback cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) if _, err := t.CopyIn(cbAddr, &cb); err != nil { @@ -424,10 +379,4 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return uintptr(nrEvents), nil, nil } -// IoCancel implements linux syscall io_cancel(2). -// -// It is not presently supported (ENOSYS indicates no support on this -// architecture). -func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS -} +// LINT.ThenChange(vfs2/aio.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 9f93f4354..c301a0991 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "vfs2", srcs = [ + "aio.go", "epoll.go", "eventfd.go", "execve.go", @@ -40,6 +41,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bits", + "//pkg/context", "//pkg/fspath", "//pkg/gohacks", "//pkg/sentry/arch", @@ -57,6 +59,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/loader", "//pkg/sentry/memmap", + "//pkg/sentry/mm", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go new file mode 100644 index 000000000..e5cdefc50 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -0,0 +1,216 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/mm" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// IoSubmit implements linux syscall io_submit(2). +func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + if nrEvents < 0 { + return 0, nil, syserror.EINVAL + } + + for i := int32(0); i < nrEvents; i++ { + // Copy in the address. + cbAddrNative := t.Arch().Native(0) + if _, err := t.CopyIn(addr, cbAddrNative); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Copy in this callback. + var cb linux.IOCallback + cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) + if _, err := t.CopyIn(cbAddr, &cb); err != nil { + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. + addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { + if cb.Reserved2 != 0 { + return syserror.EINVAL + } + + fd := t.GetFileVFS2(cb.FD) + if fd == nil { + return syserror.EBADF + } + defer fd.DecRef() + + // Was there an eventFD? Extract it. + var eventFD *vfs.FileDescription + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { + eventFD = t.GetFileVFS2(cb.ResFD) + if eventFD == nil { + return syserror.EBADF + } + defer eventFD.DecRef() + + // Check that it is an eventfd. + if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Check offset for reads/writes. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + if cb.Offset < 0 { + return syserror.EINVAL + } + } + + // Prepare the request. + aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := aioCtx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFD != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFD.IncRef() + } + + // Perform the request asynchronously. + fd.IncRef() + t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx)) + return nil +} + +func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback { + return func(ctx context.Context) { + if aioCtx.Dead() { + aioCtx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{}) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{}) + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC: + err = fd.Sync(ctx) + } + + // Update the result. + if err != nil { + err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } + + fd.DecRef() + + // Queue the result for delivery. + aioCtx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFD != nil { + eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) + eventFD.DecRef() + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 954c82f97..caa6a98ff 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -105,11 +105,7 @@ func Override() { s.Table[197] = syscalls.Supported("removexattr", Removexattr) s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) - delete(s.Table, 206) // io_setup - delete(s.Table, 207) // io_destroy - delete(s.Table, 208) // io_getevents - delete(s.Table, 209) // io_submit - delete(s.Table, 210) // io_cancel + s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) s.Table[213] = syscalls.Supported("epoll_create", EpollCreate) s.Table[217] = syscalls.Supported("getdents64", Getdents64) delete(s.Table, 221) // fdavise64 |