diff options
Diffstat (limited to 'pkg/sentry/mm/aio_context.go')
-rw-r--r-- | pkg/sentry/mm/aio_context.go | 387 |
1 files changed, 387 insertions, 0 deletions
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go new file mode 100644 index 000000000..5c61acf36 --- /dev/null +++ b/pkg/sentry/mm/aio_context.go @@ -0,0 +1,387 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// aioManager creates and manages asynchronous I/O contexts. +// +// +stateify savable +type aioManager struct { + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // aioContexts is the set of asynchronous I/O contexts. + contexts map[uint64]*AIOContext +} + +func (a *aioManager) destroy() { + a.mu.Lock() + defer a.mu.Unlock() + + for _, ctx := range a.contexts { + ctx.destroy() + } +} + +// newAIOContext creates a new context for asynchronous I/O. +// +// Returns false if 'id' is currently in use. +func (a *aioManager) newAIOContext(events uint32, id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + + if _, ok := a.contexts[id]; ok { + return false + } + + a.contexts[id] = &AIOContext{ + done: make(chan struct{}, 1), + maxOutstanding: events, + } + return true +} + +// destroyAIOContext destroys an asynchronous I/O context. +// +// False is returned if the context does not exist. +func (a *aioManager) destroyAIOContext(id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + if !ok { + return false + } + delete(a.contexts, id) + ctx.destroy() + return true +} + +// lookupAIOContext looks up the given context. +// +// Returns false if context does not exist. +func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + return ctx, ok +} + +// ioResult is a completed I/O operation. +// +// +stateify savable +type ioResult struct { + data interface{} + ioEntry +} + +// AIOContext is a single asynchronous I/O context. +// +// +stateify savable +type AIOContext struct { + // done is the notification channel used for all requests. + done chan struct{} `state:"nosave"` + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // results is the set of completed requests. + results ioList + + // maxOutstanding is the maximum number of outstanding entries; this value + // is immutable. + maxOutstanding uint32 + + // outstanding is the number of requests outstanding; this will effectively + // be the number of entries in the result list or that are expected to be + // added to the result list. + outstanding uint32 + + // dead is set when the context is destroyed. + dead bool `state:"zerovalue"` +} + +// destroy marks the context dead. +func (ctx *AIOContext) destroy() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + ctx.dead = true + if ctx.outstanding == 0 { + close(ctx.done) + } +} + +// Prepare reserves space for a new request, returning true if available. +// Returns false if the context is busy. +func (ctx *AIOContext) Prepare() bool { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding >= ctx.maxOutstanding { + return false + } + ctx.outstanding++ + return true +} + +// PopRequest pops a completed request if available, this function does not do +// any blocking. Returns false if no request is available. +func (ctx *AIOContext) PopRequest() (interface{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Is there anything ready? + if e := ctx.results.Front(); e != nil { + ctx.results.Remove(e) + ctx.outstanding-- + if ctx.outstanding == 0 && ctx.dead { + close(ctx.done) + } + return e.data, true + } + return nil, false +} + +// FinishRequest finishes a pending request. It queues up the data +// and notifies listeners. +func (ctx *AIOContext) FinishRequest(data interface{}) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Push to the list and notify opportunistically. The channel notify + // here is guaranteed to be safe because outstanding must be non-zero. + // The done channel is only closed when outstanding reaches zero. + ctx.results.PushBack(&ioResult{data: data}) + + select { + case ctx.done <- struct{}{}: + default: + } +} + +// WaitChannel returns a channel that is notified when an AIO request is +// completed. +// +// The boolean return value indicates whether or not the context is active. +func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding == 0 && ctx.dead { + return nil, false + } + return ctx.done, true +} + +// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO +// ring buffers. +// +// +stateify savable +type aioMappable struct { + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + fr platform.FileRange +} + +var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) + +func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { + fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous) + if err != nil { + return nil, err + } + return &aioMappable{mfp: mfp, fr: fr}, nil +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *aioMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.mfp.MemoryFile().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *aioMappable) MappedName(ctx context.Context) string { + return "[aio]" +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *aioMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *aioMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. +func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: aio_ring_fops.fsync == NULL + return syserror.EINVAL +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + // Require that the mapping correspond to a live AIOContext. Compare + // Linux's fs/aio.c:aio_ring_mremap(). + mm, ok := ms.(*MemoryManager) + if !ok { + return syserror.EINVAL + } + am := &mm.aioManager + am.mu.Lock() + defer am.mu.Unlock() + oldID := uint64(srcAR.Start) + aioCtx, ok := am.contexts[oldID] + if !ok { + return syserror.EINVAL + } + aioCtx.mu.Lock() + defer aioCtx.mu.Unlock() + if aioCtx.dead { + return syserror.EINVAL + } + // Use the new ID for the AIOContext. + am.contexts[uint64(dstAR.Start)] = aioCtx + delete(am.contexts, oldID) + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.mfp.MemoryFile(), + Offset: m.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// NewAIOContext creates a new context for asynchronous I/O. +// +// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). +func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { + // libaio get_ioevents() expects context "handle" to be a valid address. + // libaio peeks inside looking for a magic number. This function allocates + // a page per context and keeps it set to zeroes to ensure it will not + // match AIO_RING_MAGIC and make libaio happy. + m, err := newAIOMappable(mm.mfp) + if err != nil { + return 0, err + } + defer m.DecRef() + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: aioRingBufferSize, + MappingIdentity: m, + Mappable: m, + // TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ | + // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this + // mapping read-only? + Perms: usermem.Read, + MaxPerms: usermem.Read, + }) + if err != nil { + return 0, err + } + id := uint64(addr) + if !mm.aioManager.newAIOContext(events, id) { + mm.MUnmap(ctx, addr, aioRingBufferSize) + return 0, syserror.EINVAL + } + return id, nil +} + +// DestroyAIOContext destroys an asynchronous I/O context. It returns false if +// the context does not exist. +func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool { + if _, ok := mm.LookupAIOContext(ctx, id); !ok { + return false + } + + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) + + return mm.aioManager.destroyAIOContext(id) +} + +// LookupAIOContext looks up the given context. It returns false if the context +// does not exist. +func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { + aioCtx, ok := mm.aioManager.lookupAIOContext(id) + if !ok { + return nil, false + } + + // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes + // from id). + var buf [4]byte + _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) + if err != nil { + return nil, false + } + + return aioCtx, true +} |