diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/abi/linux/aio.go | 60 | ||||
-rw-r--r-- | pkg/abi/linux/linux_abi_autogen_unsafe.go | 6 | ||||
-rw-r--r-- | pkg/abi/linux/linux_state_autogen.go | 18 | ||||
-rw-r--r-- | pkg/sentry/kernel/aio.go | 81 | ||||
-rw-r--r-- | pkg/sentry/kernel/context.go | 53 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 10 | ||||
-rw-r--r-- | pkg/sentry/kernel/threads.go | 7 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux_state_autogen.go | 18 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_aio.go | 169 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/aio.go | 216 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/vfs2.go | 6 |
11 files changed, 448 insertions, 196 deletions
diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 3c6e0079d..86ee3f8b5 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -14,7 +14,63 @@ package linux +import "encoding/binary" + +// AIORingSize is sizeof(struct aio_ring). +const AIORingSize = 32 + +// I/O commands. const ( - // AIORingSize is sizeof(struct aio_ring). - AIORingSize = 32 + IOCB_CMD_PREAD = 0 + IOCB_CMD_PWRITE = 1 + IOCB_CMD_FSYNC = 2 + IOCB_CMD_FDSYNC = 3 + // 4 was the experimental IOCB_CMD_PREADX. + IOCB_CMD_POLL = 5 + IOCB_CMD_NOOP = 6 + IOCB_CMD_PREADV = 7 + IOCB_CMD_PWRITEV = 8 ) + +// I/O flags. +const ( + IOCB_FLAG_RESFD = 1 + IOCB_FLAG_IOPRIO = 2 +) + +// IOCallback describes an I/O request. +// +// The priority field is currently ignored in the implementation below. Also +// note that the IOCB_FLAG_RESFD feature is not supported. +type IOCallback struct { + Data uint64 + Key uint32 + _ uint32 + + OpCode uint16 + ReqPrio int16 + FD int32 + + Buf uint64 + Bytes uint64 + Offset int64 + + Reserved2 uint64 + Flags uint32 + + // eventfd to signal if IOCB_FLAG_RESFD is set in flags. + ResFD int32 +} + +// IOEvent describes an I/O result. +// +// +stateify savable +type IOEvent struct { + Data uint64 + Obj uint64 + Result int64 + Result2 int64 +} + +// IOEventSize is the size of an ioEvent encoded. +var IOEventSize = binary.Size(IOEvent{}) diff --git a/pkg/abi/linux/linux_abi_autogen_unsafe.go b/pkg/abi/linux/linux_abi_autogen_unsafe.go index f18f8a0e2..f06e71199 100644 --- a/pkg/abi/linux/linux_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_abi_autogen_unsafe.go @@ -124,12 +124,12 @@ func (s *Statx) UnmarshalBytes(src []byte) { // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Statx) Packed() bool { - return s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() + return s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Statx) MarshalUnsafe(dst []byte) { - if s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() { + if s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(s)) } else { s.MarshalBytes(dst) @@ -148,7 +148,7 @@ func (s *Statx) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (s *Statx) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { + if !s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. diff --git a/pkg/abi/linux/linux_state_autogen.go b/pkg/abi/linux/linux_state_autogen.go index b8e488a11..67da4be84 100644 --- a/pkg/abi/linux/linux_state_autogen.go +++ b/pkg/abi/linux/linux_state_autogen.go @@ -6,6 +6,23 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *IOEvent) beforeSave() {} +func (x *IOEvent) save(m state.Map) { + x.beforeSave() + m.Save("Data", &x.Data) + m.Save("Obj", &x.Obj) + m.Save("Result", &x.Result) + m.Save("Result2", &x.Result2) +} + +func (x *IOEvent) afterLoad() {} +func (x *IOEvent) load(m state.Map) { + m.Load("Data", &x.Data) + m.Load("Obj", &x.Obj) + m.Load("Result", &x.Result) + m.Load("Result2", &x.Result2) +} + func (x *BPFInstruction) beforeSave() {} func (x *BPFInstruction) save(m state.Map) { x.beforeSave() @@ -62,6 +79,7 @@ func (x *WindowSize) load(m state.Map) { } func init() { + state.Register("pkg/abi/linux.IOEvent", (*IOEvent)(nil), state.Fns{Save: (*IOEvent).save, Load: (*IOEvent).load}) state.Register("pkg/abi/linux.BPFInstruction", (*BPFInstruction)(nil), state.Fns{Save: (*BPFInstruction).save, Load: (*BPFInstruction).load}) state.Register("pkg/abi/linux.KernelTermios", (*KernelTermios)(nil), state.Fns{Save: (*KernelTermios).save, Load: (*KernelTermios).load}) state.Register("pkg/abi/linux.WindowSize", (*WindowSize)(nil), state.Fns{Save: (*WindowSize).save, Load: (*WindowSize).load}) diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go new file mode 100644 index 000000000..0ac78c0b8 --- /dev/null +++ b/pkg/sentry/kernel/aio.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" +) + +// AIOCallback is an function that does asynchronous I/O on behalf of a task. +type AIOCallback func(context.Context) + +// QueueAIO queues an AIOCallback which will be run asynchronously. +func (t *Task) QueueAIO(cb AIOCallback) { + ctx := taskAsyncContext{t: t} + wg := &t.TaskSet().aioGoroutines + wg.Add(1) + go func() { + cb(ctx) + wg.Done() + }() +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + +// Value implements context.Context.Value. +func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 0c40bf315..dd5f0f5fa 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -18,7 +18,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -113,55 +112,3 @@ func (*Task) Done() <-chan struct{} { func (*Task) Err() error { return nil } - -// AsyncContext returns a context.Context that may be used by goroutines that -// do work on behalf of t and therefore share its contextual values, but are -// not t's task goroutine (e.g. asynchronous I/O). -func (t *Task) AsyncContext() context.Context { - return taskAsyncContext{t: t} -} - -type taskAsyncContext struct { - context.NoopSleeper - t *Task -} - -// Debugf implements log.Logger.Debugf. -func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { - ctx.t.Debugf(format, v...) -} - -// Infof implements log.Logger.Infof. -func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { - ctx.t.Infof(format, v...) -} - -// Warningf implements log.Logger.Warningf. -func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { - ctx.t.Warningf(format, v...) -} - -// IsLogging implements log.Logger.IsLogging. -func (ctx taskAsyncContext) IsLogging(level log.Level) bool { - return ctx.t.IsLogging(level) -} - -// Deadline implements context.Context.Deadline. -func (ctx taskAsyncContext) Deadline() (time.Time, bool) { - return ctx.t.Deadline() -} - -// Done implements context.Context.Done. -func (ctx taskAsyncContext) Done() <-chan struct{} { - return ctx.t.Done() -} - -// Err implements context.Context.Err. -func (ctx taskAsyncContext) Err() error { - return ctx.t.Err() -} - -// Value implements context.Context.Value. -func (ctx taskAsyncContext) Value(key interface{}) interface{} { - return ctx.t.Value(key) -} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 52491da7a..554a42e05 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -452,9 +452,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { return err } - // Ensure that all pending asynchronous work is complete: - // - inode and mount release - // - asynchronuous IO + // Ensure that all inode and mount release operations have completed. fs.AsyncBarrier() // Once all fs work has completed (flushed references have all been released), @@ -1249,13 +1247,15 @@ func (k *Kernel) Kill(es ExitStatus) { } // Pause requests that all tasks in k temporarily stop executing, and blocks -// until all tasks in k have stopped. Multiple calls to Pause nest and require -// an equal number of calls to Unpause to resume execution. +// until all tasks and asynchronous I/O operations in k have stopped. Multiple +// calls to Pause nest and require an equal number of calls to Unpause to +// resume execution. func (k *Kernel) Pause() { k.extMu.Lock() k.tasks.BeginExternalStop() k.extMu.Unlock() k.tasks.runningGoroutines.Wait() + k.tasks.aioGoroutines.Wait() } // Unpause ends the effect of a previous call to Pause. If Unpause is called diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index bf2dabb6e..872e1a82d 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -87,6 +87,13 @@ type TaskSet struct { // at time of save (but note that this is not necessarily the same thing as // sync.WaitGroup's zero value). runningGoroutines sync.WaitGroup `state:"nosave"` + + // aioGoroutines is the number of goroutines running async I/O + // callbacks. + // + // aioGoroutines is not saved but is required to be zero at the time of + // save. + aioGoroutines sync.WaitGroup `state:"nosave"` } // newTaskSet returns a new, empty TaskSet. diff --git a/pkg/sentry/syscalls/linux/linux_state_autogen.go b/pkg/sentry/syscalls/linux/linux_state_autogen.go index cbae0a871..c76ccd1d5 100644 --- a/pkg/sentry/syscalls/linux/linux_state_autogen.go +++ b/pkg/sentry/syscalls/linux/linux_state_autogen.go @@ -6,23 +6,6 @@ import ( "gvisor.dev/gvisor/pkg/state" ) -func (x *ioEvent) beforeSave() {} -func (x *ioEvent) save(m state.Map) { - x.beforeSave() - m.Save("Data", &x.Data) - m.Save("Obj", &x.Obj) - m.Save("Result", &x.Result) - m.Save("Result2", &x.Result2) -} - -func (x *ioEvent) afterLoad() {} -func (x *ioEvent) load(m state.Map) { - m.Load("Data", &x.Data) - m.Load("Obj", &x.Obj) - m.Load("Result", &x.Result) - m.Load("Result2", &x.Result2) -} - func (x *futexWaitRestartBlock) beforeSave() {} func (x *futexWaitRestartBlock) save(m state.Map) { x.beforeSave() @@ -73,7 +56,6 @@ func (x *clockNanosleepRestartBlock) load(m state.Map) { } func init() { - state.Register("pkg/sentry/syscalls/linux.ioEvent", (*ioEvent)(nil), state.Fns{Save: (*ioEvent).save, Load: (*ioEvent).load}) state.Register("pkg/sentry/syscalls/linux.futexWaitRestartBlock", (*futexWaitRestartBlock)(nil), state.Fns{Save: (*futexWaitRestartBlock).save, Load: (*futexWaitRestartBlock).load}) state.Register("pkg/sentry/syscalls/linux.pollRestartBlock", (*pollRestartBlock)(nil), state.Fns{Save: (*pollRestartBlock).save, Load: (*pollRestartBlock).load}) state.Register("pkg/sentry/syscalls/linux.clockNanosleepRestartBlock", (*clockNanosleepRestartBlock)(nil), state.Fns{Save: (*clockNanosleepRestartBlock).save, Load: (*clockNanosleepRestartBlock).load}) diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index d781d6a04..ba2557c52 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -15,8 +15,8 @@ package linux import ( - "encoding/binary" - + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -27,59 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// I/O commands. -const ( - _IOCB_CMD_PREAD = 0 - _IOCB_CMD_PWRITE = 1 - _IOCB_CMD_FSYNC = 2 - _IOCB_CMD_FDSYNC = 3 - _IOCB_CMD_NOOP = 6 - _IOCB_CMD_PREADV = 7 - _IOCB_CMD_PWRITEV = 8 -) - -// I/O flags. -const ( - _IOCB_FLAG_RESFD = 1 -) - -// ioCallback describes an I/O request. -// -// The priority field is currently ignored in the implementation below. Also -// note that the IOCB_FLAG_RESFD feature is not supported. -type ioCallback struct { - Data uint64 - Key uint32 - Reserved1 uint32 - - OpCode uint16 - ReqPrio int16 - FD int32 - - Buf uint64 - Bytes uint64 - Offset int64 - - Reserved2 uint64 - Flags uint32 - - // eventfd to signal if IOCB_FLAG_RESFD is set in flags. - ResFD int32 -} - -// ioEvent describes an I/O result. -// -// +stateify savable -type ioEvent struct { - Data uint64 - Obj uint64 - Result int64 - Result2 int64 -} - -// ioEventSize is the size of an ioEvent encoded. -var ioEventSize = binary.Size(ioEvent{}) - // IoSetup implements linux syscall io_setup(2). func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nrEvents := args[0].Int() @@ -192,7 +139,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } } - ev := v.(*ioEvent) + ev := v.(*linux.IOEvent) // Copy out the result. if _, err := t.CopyOut(eventsAddr, ev); err != nil { @@ -204,7 +151,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } // Keep rolling. - eventsAddr += usermem.Addr(ioEventSize) + eventsAddr += usermem.Addr(linux.IOEventSize) } // Everything finished. @@ -231,7 +178,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl } // memoryFor returns appropriate memory for the given callback. -func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. @@ -242,17 +189,17 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { // we have no guarantee that t's AddressSpace will be active during the // I/O. switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP: + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: return usermem.IOSequence{}, nil default: @@ -261,54 +208,62 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { } } -func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) { - if ctx.Dead() { - ctx.CancelPendingRequest() - return - } - ev := &ioEvent{ - Data: cb.Data, - Obj: uint64(cbAddr), - } +// IoCancel implements linux syscall io_cancel(2). +// +// It is not presently supported (ENOSYS indicates no support on this +// architecture). +func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} - // Construct a context.Context that will not be interrupted if t is - // interrupted. - c := t.AsyncContext() +// LINT.IfChange - var err error - switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV: - ev.Result, err = file.Preadv(c, ioseq, cb.Offset) - case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: - ev.Result, err = file.Pwritev(c, ioseq, cb.Offset) - case _IOCB_CMD_FSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll) - case _IOCB_CMD_FDSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData) - } +func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback { + return func(ctx context.Context) { + if actx.Dead() { + actx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } - // Update the result. - if err != nil { - err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) - ev.Result = -int64(kernel.ExtractErrno(err, 0)) - } + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_FSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + case linux.IOCB_CMD_FDSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData) + } + + // Update the result. + if err != nil { + err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } - file.DecRef() + file.DecRef() - // Queue the result for delivery. - ctx.FinishRequest(ev) + // Queue the result for delivery. + actx.FinishRequest(ev) - // Notify the event file if one was specified. This needs to happen - // *after* queueing the result to avoid racing with the thread we may - // wake up. - if eventFile != nil { - eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) - eventFile.DecRef() + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFile != nil { + eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) + eventFile.DecRef() + } } } // submitCallback processes a single callback. -func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { file := t.GetFile(cb.FD) if file == nil { // File not found. @@ -318,7 +273,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Was there an eventFD? Extract it. var eventFile *fs.File - if cb.Flags&_IOCB_FLAG_RESFD != 0 { + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. @@ -340,7 +295,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Check offset for reads/writes. switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { return syserror.EINVAL } @@ -366,7 +321,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Perform the request asynchronously. file.IncRef() - fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) }) + t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile)) // All set. return nil @@ -395,7 +350,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Copy in this callback. - var cb ioCallback + var cb linux.IOCallback cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) if _, err := t.CopyIn(cbAddr, &cb); err != nil { @@ -424,10 +379,4 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return uintptr(nrEvents), nil, nil } -// IoCancel implements linux syscall io_cancel(2). -// -// It is not presently supported (ENOSYS indicates no support on this -// architecture). -func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS -} +// LINT.ThenChange(vfs2/aio.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go new file mode 100644 index 000000000..e5cdefc50 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -0,0 +1,216 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/mm" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// IoSubmit implements linux syscall io_submit(2). +func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + if nrEvents < 0 { + return 0, nil, syserror.EINVAL + } + + for i := int32(0); i < nrEvents; i++ { + // Copy in the address. + cbAddrNative := t.Arch().Native(0) + if _, err := t.CopyIn(addr, cbAddrNative); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Copy in this callback. + var cb linux.IOCallback + cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) + if _, err := t.CopyIn(cbAddr, &cb); err != nil { + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. + addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { + if cb.Reserved2 != 0 { + return syserror.EINVAL + } + + fd := t.GetFileVFS2(cb.FD) + if fd == nil { + return syserror.EBADF + } + defer fd.DecRef() + + // Was there an eventFD? Extract it. + var eventFD *vfs.FileDescription + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { + eventFD = t.GetFileVFS2(cb.ResFD) + if eventFD == nil { + return syserror.EBADF + } + defer eventFD.DecRef() + + // Check that it is an eventfd. + if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Check offset for reads/writes. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + if cb.Offset < 0 { + return syserror.EINVAL + } + } + + // Prepare the request. + aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := aioCtx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFD != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFD.IncRef() + } + + // Perform the request asynchronously. + fd.IncRef() + t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx)) + return nil +} + +func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback { + return func(ctx context.Context) { + if aioCtx.Dead() { + aioCtx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{}) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{}) + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC: + err = fd.Sync(ctx) + } + + // Update the result. + if err != nil { + err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } + + fd.DecRef() + + // Queue the result for delivery. + aioCtx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFD != nil { + eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) + eventFD.DecRef() + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 954c82f97..caa6a98ff 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -105,11 +105,7 @@ func Override() { s.Table[197] = syscalls.Supported("removexattr", Removexattr) s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) - delete(s.Table, 206) // io_setup - delete(s.Table, 207) // io_destroy - delete(s.Table, 208) // io_getevents - delete(s.Table, 209) // io_submit - delete(s.Table, 210) // io_cancel + s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) s.Table[213] = syscalls.Supported("epoll_create", EpollCreate) s.Table[217] = syscalls.Supported("getdents64", Getdents64) delete(s.Table, 221) // fdavise64 |