summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/epoll/epoll.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/epoll/epoll.go')
-rw-r--r--pkg/sentry/kernel/epoll/epoll.go466
1 files changed, 466 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
new file mode 100644
index 000000000..b572fcd7e
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -0,0 +1,466 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package epoll provides an implementation of Linux's IO event notification
+// facility. See epoll(7) for more details.
+package epoll
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/ilist"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Event describes the event mask that was observed and the user data to be
+// returned when one of the events occurs. It has this format to match the linux
+// format to avoid extra copying/allocation when writing events to userspace.
+type Event struct {
+ // Events is the event mask containing the set of events that have been
+ // observed on an entry.
+ Events uint32
+
+ // Data is an opaque 64-bit value provided by the caller when adding the
+ // entry, and returned to the caller when the entry reports an event.
+ Data [2]int32
+}
+
+// EntryFlags is a bitmask that holds an entry's flags.
+type EntryFlags int
+
+// Valid entry flags.
+const (
+ OneShot EntryFlags = 1 << iota
+ EdgeTriggered
+)
+
+// FileIdentifier identifies a file. We cannot use just the FD because it could
+// potentially be reassigned. We also cannot use just the file pointer because
+// it is possible to have multiple entries for the same file object as long as
+// they are created with different FDs (i.e., the FDs point to the same file).
+type FileIdentifier struct {
+ File *fs.File
+ Fd kdefs.FD
+}
+
+// pollEntry holds all the state associated with an event poll entry, that is,
+// a file being observed by an event poll object.
+type pollEntry struct {
+ ilist.Entry
+ file *refs.WeakRef `state:"manual"`
+ id FileIdentifier `state:"wait"`
+ userData [2]int32
+ waiter waiter.Entry `state:"manual"`
+ mask waiter.EventMask
+ flags EntryFlags
+
+ epoll *EventPoll
+
+ // We cannot save the current list pointer as it points into EventPoll
+ // struct, while state framework currently does not support such
+ // in-struct pointers. Instead, EventPoll will properly set this field
+ // in its loading logic.
+ curList *ilist.List `state:"nosave"`
+}
+
+// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
+// weakReferenceGone is called when the file in the weak reference is destroyed.
+// The poll entry is removed in response to this.
+func (p *pollEntry) WeakRefGone() {
+ p.epoll.RemoveEntry(p.id)
+}
+
+// EventPoll holds all the state associated with an event poll object, that is,
+// collection of files to observe and their current state.
+type EventPoll struct {
+ fsutil.PipeSeek `state:"zerovalue"`
+ fsutil.NotDirReaddir `state:"zerovalue"`
+ fsutil.NoFsync `state:"zerovalue"`
+ fsutil.NoopFlush `state:"zerovalue"`
+ fsutil.NoMMap `state:"zerovalue"`
+ fsutil.NoIoctl `state:"zerovalue"`
+
+ // Wait queue is used to notify interested parties when the event poll
+ // object itself becomes readable or writable.
+ waiter.Queue
+
+ // files is the map of all the files currently being observed, it is
+ // protected by mu.
+ mu sync.Mutex `state:"nosave"`
+ files map[FileIdentifier]*pollEntry
+
+ // listsMu protects manipulation of the lists below. It needs to be a
+ // different lock to avoid circular lock acquisition order involving
+ // the wait queue mutexes and mu. The full order is mu, observed file
+ // wait queue mutex, then listsMu; this allows listsMu to be acquired
+ // when readyCallback is called.
+ //
+ // An entry is always in one of the following lists:
+ // readyList -- when there's a chance that it's ready to have
+ // events delivered to epoll waiters. Given that being
+ // ready is a transient state, the Readiness() and
+ // readEvents() functions always call the entry's file
+ // Readiness() function to confirm it's ready.
+ // waitingList -- when there's no chance that the entry is ready,
+ // so it's waiting for the readyCallback to be called
+ // on it before it gets moved to the readyList.
+ // disabledList -- when the entry is disabled. This happens when
+ // a one-shot entry gets delivered via readEvents().
+ listsMu sync.Mutex `state:"nosave"`
+ readyList ilist.List
+ waitingList ilist.List
+ disabledList ilist.List
+}
+
+// cycleMu is used to serialize all the cycle checks. This is only used when
+// an event poll file is added as an entry to another event poll. Such checks
+// are serialized to avoid lock acquisition order inversion: if a thread is
+// adding A to B, and another thread is adding B to A, each would acquire A's
+// and B's mutexes in reverse order, and could cause deadlocks. Having this
+// lock prevents this by allowing only one check at a time to happen.
+//
+// We do the cycle check to prevent callers from introducing potentially
+// infinite recursions. If a caller were to add A to B and then B to A, for
+// event poll A to know if it's readable, it would need to check event poll B,
+// which in turn would need event poll A and so on indefinitely.
+var cycleMu sync.Mutex
+
+// NewEventPoll allocates and initializes a new event poll object.
+func NewEventPoll(ctx context.Context) *fs.File {
+ // name matches fs/eventpoll.c:epoll_create1.
+ dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
+ return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
+ files: make(map[FileIdentifier]*pollEntry),
+ })
+}
+
+// Release implements fs.FileOperations.Release.
+func (e *EventPoll) Release() {
+ // We need to take the lock now because files may be attempting to
+ // remove entries in parallel if they get destroyed.
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Go through all entries and clean up.
+ for _, entry := range e.files {
+ entry.id.File.EventUnregister(&entry.waiter)
+ entry.file.Drop()
+ }
+}
+
+// Read implements fs.FileOperations.Read.
+func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syscall.ENOSYS
+}
+
+// Write implements fs.FileOperations.Write.
+func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syscall.ENOSYS
+}
+
+// eventsAvailable determines if 'e' has events available for delivery.
+func (e *EventPoll) eventsAvailable() bool {
+ e.listsMu.Lock()
+
+ for it := e.readyList.Front(); it != nil; {
+ entry := it.(*pollEntry)
+ it = it.Next()
+
+ // If the entry is ready, we know 'e' has at least one entry
+ // ready for delivery.
+ ready := entry.id.File.Readiness(entry.mask)
+ if ready != 0 {
+ e.listsMu.Unlock()
+ return true
+ }
+
+ // Entry is not ready, so move it to waiting list.
+ e.readyList.Remove(entry)
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+ }
+
+ e.listsMu.Unlock()
+
+ return false
+}
+
+// Readiness determines if the event poll object is currently readable (i.e.,
+// if there are pending events for delivery).
+func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ready := waiter.EventMask(0)
+
+ if (mask&waiter.EventIn) != 0 && e.eventsAvailable() {
+ ready |= waiter.EventIn
+ }
+
+ return ready
+}
+
+// ReadEvents returns up to max available events.
+func (e *EventPoll) ReadEvents(max int) []Event {
+ var local ilist.List
+ var ret []Event
+
+ e.listsMu.Lock()
+
+ // Go through all entries we believe may be ready.
+ for it := e.readyList.Front(); it != nil && len(ret) < max; {
+ entry := it.(*pollEntry)
+ it = it.Next()
+
+ // Check the entry's readiness. It it's not really ready, we
+ // just put it back in the waiting list and move on to the next
+ // entry.
+ ready := entry.id.File.Readiness(entry.mask) & entry.mask
+ if ready == 0 {
+ e.readyList.Remove(entry)
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+
+ continue
+ }
+
+ // Add event to the array that will be returned to caller.
+ ret = append(ret, Event{
+ Events: uint32(ready),
+ Data: entry.userData,
+ })
+
+ // The entry is consumed, so we must move it to the disabled
+ // list in case it's one-shot, or back to the wait list if it's
+ // edge-triggered. If it's neither, we leave it in the ready
+ // list so that its readiness can be checked the next time
+ // around; however, we must move it to the end of the list so
+ // that other events can be delivered as well.
+ e.readyList.Remove(entry)
+ if entry.flags&OneShot != 0 {
+ e.disabledList.PushBack(entry)
+ entry.curList = &e.disabledList
+ } else if entry.flags&EdgeTriggered != 0 {
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+ } else {
+ local.PushBack(entry)
+ }
+ }
+
+ e.readyList.PushBackList(&local)
+
+ e.listsMu.Unlock()
+
+ return ret
+}
+
+// readyCallback is called when one of the files we're polling becomes ready. It
+// moves said file to the readyList if it's currently in the waiting list.
+type readyCallback struct{}
+
+// Callback implements waiter.EntryCallback.Callback.
+func (*readyCallback) Callback(w *waiter.Entry) {
+ entry := w.Context.(*pollEntry)
+ e := entry.epoll
+
+ e.listsMu.Lock()
+
+ if entry.curList == &e.waitingList {
+ e.waitingList.Remove(entry)
+ e.readyList.PushBack(entry)
+ entry.curList = &e.readyList
+
+ e.Notify(waiter.EventIn)
+ }
+
+ e.listsMu.Unlock()
+}
+
+// initEntryReadiness initializes the entry's state with regards to its
+// readiness by placing it in the appropriate list and registering for
+// notifications.
+func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
+ // A new entry starts off in the waiting list.
+ e.listsMu.Lock()
+ e.waitingList.PushBack(entry)
+ entry.curList = &e.waitingList
+ e.listsMu.Unlock()
+
+ // Register for event notifications.
+ f := entry.id.File
+ f.EventRegister(&entry.waiter, entry.mask)
+
+ // Check if the file happens to already be in a ready state.
+ ready := f.Readiness(entry.mask) & entry.mask
+ if ready != 0 {
+ (*readyCallback).Callback(nil, &entry.waiter)
+ }
+}
+
+// observes checks if event poll object e is directly or indirectly observing
+// event poll object ep. It uses a bounded recursive depth-first search.
+func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool {
+ // If we reached the maximum depth, we'll consider that we found it
+ // because we don't want to allow chains that are too long.
+ if depthLeft <= 0 {
+ return true
+ }
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Go through each observed file and check if it is or observes ep.
+ for id := range e.files {
+ f, ok := id.File.FileOperations.(*EventPoll)
+ if !ok {
+ continue
+ }
+
+ if f == ep || f.observes(ep, depthLeft-1) {
+ return true
+ }
+ }
+
+ return false
+}
+
+// AddEntry adds a new file to the collection of files observed by e.
+func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
+ // Acquire cycle check lock if another event poll is being added.
+ ep, ok := id.File.FileOperations.(*EventPoll)
+ if ok {
+ cycleMu.Lock()
+ defer cycleMu.Unlock()
+ }
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Fail if the file already has an entry.
+ if _, ok := e.files[id]; ok {
+ return syscall.EEXIST
+ }
+
+ // Check if a cycle would be created. We use 4 as the limit because
+ // that's the value used by linux and we want to emulate it.
+ if ep != nil {
+ if e == ep {
+ return syscall.EINVAL
+ }
+
+ if ep.observes(e, 4) {
+ return syscall.ELOOP
+ }
+ }
+
+ // Create new entry and add it to map.
+ //
+ // N.B. Even though we are creating a weak reference here, we know it
+ // won't trigger a callback because we hold a reference to the file
+ // throughout the execution of this function.
+ entry := &pollEntry{
+ id: id,
+ userData: data,
+ epoll: e,
+ flags: flags,
+ waiter: waiter.Entry{Callback: &readyCallback{}},
+ mask: mask,
+ }
+ entry.waiter.Context = entry
+ e.files[id] = entry
+ entry.file = refs.NewWeakRef(id.File, entry)
+
+ // Initialize the readiness state of the new entry.
+ e.initEntryReadiness(entry)
+
+ return nil
+}
+
+// UpdateEntry updates the flags, mask and user data associated with a file that
+// is already part of the collection of observed files.
+func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Fail if the file doesn't have an entry.
+ entry, ok := e.files[id]
+ if !ok {
+ return syscall.ENOENT
+ }
+
+ // Unregister the old mask and remove entry from the list it's in, so
+ // readyCallback is guaranteed to not be called on this entry anymore.
+ entry.id.File.EventUnregister(&entry.waiter)
+
+ // Remove entry from whatever list it's in. This ensure that no other
+ // threads have access to this entry as the only way left to find it
+ // is via e.files, but we hold e.mu, which prevents that.
+ e.listsMu.Lock()
+ entry.curList.Remove(entry)
+ e.listsMu.Unlock()
+
+ // Initialize new readiness state.
+ entry.flags = flags
+ entry.mask = mask
+ entry.userData = data
+ e.initEntryReadiness(entry)
+
+ return nil
+}
+
+// RemoveEntry a files from the collection of observed files.
+func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // Fail if the file doesn't have an entry.
+ entry, ok := e.files[id]
+ if !ok {
+ return syscall.ENOENT
+ }
+
+ // Unregister from file first so that no concurrent attempts will be
+ // made to manipulate the file.
+ entry.id.File.EventUnregister(&entry.waiter)
+
+ // Remove from the current list.
+ e.listsMu.Lock()
+ entry.curList.Remove(entry)
+ entry.curList = nil
+ e.listsMu.Unlock()
+
+ // Remove file from map, and drop weak reference.
+ delete(e.files, id)
+ entry.file.Drop()
+
+ return nil
+}
+
+// UnregisterEpollWaiters removes the epoll waiter objects from the waiting
+// queues. This is different from Release() as the file is not dereferenced.
+func (e *EventPoll) UnregisterEpollWaiters() {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ for _, entry := range e.files {
+ entry.id.File.EventUnregister(&entry.waiter)
+ }
+}