diff options
Diffstat (limited to 'pkg/sentry')
-rwxr-xr-x | pkg/sentry/vfs/epoll.go | 377 | ||||
-rwxr-xr-x | pkg/sentry/vfs/epoll_interest_list.go | 173 | ||||
-rwxr-xr-x | pkg/sentry/vfs/file_description.go | 38 | ||||
-rwxr-xr-x | pkg/sentry/vfs/vfs.go | 16 | ||||
-rwxr-xr-x | pkg/sentry/vfs/vfs_state_autogen.go | 34 |
5 files changed, 632 insertions, 6 deletions
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go new file mode 100755 index 000000000..7c83f9a5a --- /dev/null +++ b/pkg/sentry/vfs/epoll.go @@ -0,0 +1,377 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// epollCycleMu serializes attempts to register EpollInstances with other +// EpollInstances in order to check for cycles. +var epollCycleMu sync.Mutex + +// EpollInstance represents an epoll instance, as described by epoll(7). +type EpollInstance struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + // q holds waiters on this EpollInstance. + q waiter.Queue + + // interest is the set of file descriptors that are registered with the + // EpollInstance for monitoring. interest is protected by interestMu. + interestMu sync.Mutex + interest map[epollInterestKey]*epollInterest + + // mu protects fields in registered epollInterests. + mu sync.Mutex + + // ready is the set of file descriptors that may be "ready" for I/O. Note + // that this must be an ordered list, not a map: "If more than maxevents + // file descriptors are ready when epoll_wait() is called, then successive + // epoll_wait() calls will round robin through the set of ready file + // descriptors. This behavior helps avoid starvation scenarios, where a + // process fails to notice that additional file descriptors are ready + // because it focuses on a set of file descriptors that are already known + // to be ready." - epoll_wait(2) + ready epollInterestList +} + +type epollInterestKey struct { + // file is the registered FileDescription. No reference is held on file; + // instead, when the last reference is dropped, FileDescription.DecRef() + // removes the FileDescription from all EpollInstances. file is immutable. + file *FileDescription + + // num is the file descriptor number with which this entry was registered. + // num is immutable. + num int32 +} + +// epollInterest represents an EpollInstance's interest in a file descriptor. +type epollInterest struct { + // epoll is the owning EpollInstance. epoll is immutable. + epoll *EpollInstance + + // key is the file to which this epollInterest applies. key is immutable. + key epollInterestKey + + // waiter is registered with key.file. entry is protected by epoll.mu. + waiter waiter.Entry + + // mask is the event mask associated with this registration, including + // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu. + mask uint32 + + // ready is true if epollInterestEntry is linked into epoll.ready. ready + // and epollInterestEntry are protected by epoll.mu. + ready bool + epollInterestEntry + + // userData is the epoll_data_t associated with this epollInterest. + // userData is protected by epoll.mu. + userData [2]int32 +} + +// NewEpollInstanceFD returns a FileDescription representing a new epoll +// instance. A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { + vd := vfs.NewAnonVirtualDentry("[eventpoll]") + defer vd.DecRef() + ep := &EpollInstance{ + interest: make(map[epollInterestKey]*epollInterest), + } + if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &ep.vfsfd, nil +} + +// Release implements FileDescriptionImpl.Release. +func (ep *EpollInstance) Release() { + // Unregister all polled fds. + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + for key, epi := range ep.interest { + file := key.file + file.epollMu.Lock() + delete(file.epolls, epi) + file.epollMu.Unlock() + file.EventUnregister(&epi.waiter) + } + ep.interest = nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask { + if mask&waiter.EventIn == 0 { + return 0 + } + ep.mu.Lock() + for epi := ep.ready.Front(); epi != nil; epi = epi.Next() { + wmask := waiter.EventMaskFromLinux(epi.mask) + if epi.key.file.Readiness(wmask)&wmask != 0 { + ep.mu.Unlock() + return waiter.EventIn + } + } + ep.mu.Unlock() + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + ep.q.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (ep *EpollInstance) EventUnregister(e *waiter.Entry) { + ep.q.EventUnregister(e) +} + +// Seek implements FileDescriptionImpl.Seek. +func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek + return 0, nil +} + +// AddInterest implements the semantics of EPOLL_CTL_ADD. +// +// Preconditions: A reference must be held on file. +func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error { + // Check for cyclic polling if necessary. + subep, _ := file.impl.(*EpollInstance) + if subep != nil { + epollCycleMu.Lock() + // epollCycleMu must be locked for the rest of AddInterest to ensure + // that cyclic polling is not introduced after the check. + defer epollCycleMu.Unlock() + if subep.mightPoll(ep) { + return syserror.ELOOP + } + } + + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + + // Fail if the key is already registered. + key := epollInterestKey{ + file: file, + num: num, + } + if _, ok := ep.interest[key]; ok { + return syserror.EEXIST + } + + // Register interest in file. + mask |= linux.EPOLLERR | linux.EPOLLRDHUP + epi := &epollInterest{ + epoll: ep, + key: key, + mask: mask, + userData: userData, + } + ep.interest[key] = epi + wmask := waiter.EventMaskFromLinux(mask) + file.EventRegister(&epi.waiter, wmask) + + // Check if the file is already ready. + if file.Readiness(wmask)&wmask != 0 { + epi.Callback(nil) + } + + // Add epi to file.epolls so that it is removed when the last + // FileDescription reference is dropped. + file.epollMu.Lock() + file.epolls[epi] = struct{}{} + file.epollMu.Unlock() + + return nil +} + +func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool { + return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS +} + +func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool { + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + for key := range ep.interest { + nextep, ok := key.file.impl.(*EpollInstance) + if !ok { + continue + } + if nextep == ep2 { + return true + } + if remainingRecursion == 0 { + return true + } + if nextep.mightPollRecursive(ep2, remainingRecursion-1) { + return true + } + } + return false +} + +// ModifyInterest implements the semantics of EPOLL_CTL_MOD. +// +// Preconditions: A reference must be held on file. +func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error { + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + + // Fail if the key is not already registered. + epi, ok := ep.interest[epollInterestKey{ + file: file, + num: num, + }] + if !ok { + return syserror.ENOENT + } + + // Update epi for the next call to ep.ReadEvents(). + ep.mu.Lock() + epi.mask = mask + epi.userData = userData + ep.mu.Unlock() + + // Re-register with the new mask. + mask |= linux.EPOLLERR | linux.EPOLLRDHUP + file.EventUnregister(&epi.waiter) + wmask := waiter.EventMaskFromLinux(mask) + file.EventRegister(&epi.waiter, wmask) + + // Check if the file is already ready with the new mask. + if file.Readiness(wmask)&wmask != 0 { + epi.Callback(nil) + } + + return nil +} + +// DeleteInterest implements the semantics of EPOLL_CTL_DEL. +// +// Preconditions: A reference must be held on file. +func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error { + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + + // Fail if the key is not already registered. + epi, ok := ep.interest[epollInterestKey{ + file: file, + num: num, + }] + if !ok { + return syserror.ENOENT + } + + // Unregister from the file so that epi will no longer be readied. + file.EventUnregister(&epi.waiter) + + // Forget about epi. + ep.removeLocked(epi) + + file.epollMu.Lock() + delete(file.epolls, epi) + file.epollMu.Unlock() + + return nil +} + +// Callback implements waiter.EntryCallback.Callback. +func (epi *epollInterest) Callback(*waiter.Entry) { + newReady := false + epi.epoll.mu.Lock() + if !epi.ready { + newReady = true + epi.ready = true + epi.epoll.ready.PushBack(epi) + } + epi.epoll.mu.Unlock() + if newReady { + epi.epoll.q.Notify(waiter.EventIn) + } +} + +// Preconditions: ep.interestMu must be locked. +func (ep *EpollInstance) removeLocked(epi *epollInterest) { + delete(ep.interest, epi.key) + ep.mu.Lock() + if epi.ready { + epi.ready = false + ep.ready.Remove(epi) + } + ep.mu.Unlock() +} + +// ReadEvents reads up to len(events) ready events into events and returns the +// number of events read. +// +// Preconditions: len(events) != 0. +func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { + i := 0 + // Hot path: avoid defer. + ep.mu.Lock() + var next *epollInterest + var requeue epollInterestList + for epi := ep.ready.Front(); epi != nil; epi = next { + next = epi.Next() + // Regardless of what else happens, epi is initially removed from the + // ready list. + ep.ready.Remove(epi) + wmask := waiter.EventMaskFromLinux(epi.mask) + ievents := epi.key.file.Readiness(wmask) & wmask + if ievents == 0 { + // Leave epi off the ready list. + epi.ready = false + continue + } + // Determine what we should do with epi. + switch { + case epi.mask&linux.EPOLLONESHOT != 0: + // Clear all events from the mask; they must be re-added by + // EPOLL_CTL_MOD. + epi.mask &= linux.EP_PRIVATE_BITS + fallthrough + case epi.mask&linux.EPOLLET != 0: + // Leave epi off the ready list. + epi.ready = false + default: + // Queue epi to be moved to the end of the ready list. + requeue.PushBack(epi) + } + // Report ievents. + events[i] = linux.EpollEvent{ + Events: ievents.ToLinux(), + Fd: epi.userData[0], + Data: epi.userData[1], + } + i++ + if i == len(events) { + break + } + } + ep.ready.PushBackList(&requeue) + ep.mu.Unlock() + return i +} diff --git a/pkg/sentry/vfs/epoll_interest_list.go b/pkg/sentry/vfs/epoll_interest_list.go new file mode 100755 index 000000000..16cf878cd --- /dev/null +++ b/pkg/sentry/vfs/epoll_interest_list.go @@ -0,0 +1,173 @@ +package vfs + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type epollInterestElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (epollInterestElementMapper) linkerFor(elem *epollInterest) *epollInterest { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type epollInterestList struct { + head *epollInterest + tail *epollInterest +} + +// Reset resets list l to the empty state. +func (l *epollInterestList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *epollInterestList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *epollInterestList) Front() *epollInterest { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *epollInterestList) Back() *epollInterest { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *epollInterestList) PushFront(e *epollInterest) { + epollInterestElementMapper{}.linkerFor(e).SetNext(l.head) + epollInterestElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + epollInterestElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *epollInterestList) PushBack(e *epollInterest) { + epollInterestElementMapper{}.linkerFor(e).SetNext(nil) + epollInterestElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + epollInterestElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *epollInterestList) PushBackList(m *epollInterestList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + epollInterestElementMapper{}.linkerFor(l.tail).SetNext(m.head) + epollInterestElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *epollInterestList) InsertAfter(b, e *epollInterest) { + a := epollInterestElementMapper{}.linkerFor(b).Next() + epollInterestElementMapper{}.linkerFor(e).SetNext(a) + epollInterestElementMapper{}.linkerFor(e).SetPrev(b) + epollInterestElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + epollInterestElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *epollInterestList) InsertBefore(a, e *epollInterest) { + b := epollInterestElementMapper{}.linkerFor(a).Prev() + epollInterestElementMapper{}.linkerFor(e).SetNext(a) + epollInterestElementMapper{}.linkerFor(e).SetPrev(b) + epollInterestElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + epollInterestElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *epollInterestList) Remove(e *epollInterest) { + prev := epollInterestElementMapper{}.linkerFor(e).Prev() + next := epollInterestElementMapper{}.linkerFor(e).Next() + + if prev != nil { + epollInterestElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + epollInterestElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type epollInterestEntry struct { + next *epollInterest + prev *epollInterest +} + +// Next returns the entry that follows e in the list. +func (e *epollInterestEntry) Next() *epollInterest { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *epollInterestEntry) Prev() *epollInterest { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *epollInterestEntry) SetNext(elem *epollInterest) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *epollInterestEntry) SetPrev(elem *epollInterest) { + e.prev = elem +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 225024463..badacb55e 100755 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -45,6 +46,11 @@ type FileDescription struct { // memory operations. statusFlags uint32 + // epolls is the set of epollInterests registered for this FileDescription. + // epolls is protected by epollMu. + epollMu sync.Mutex + epolls map[*epollInterest]struct{} + // vd is the filesystem location at which this FileDescription was opened. // A reference is held on vd. vd is immutable. vd VirtualDentry @@ -141,6 +147,23 @@ func (fd *FileDescription) TryIncRef() bool { // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef() { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + // Unregister fd from all epoll instances. + fd.epollMu.Lock() + epolls := fd.epolls + fd.epolls = nil + fd.epollMu.Unlock() + for epi := range epolls { + ep := epi.epoll + ep.interestMu.Lock() + // Check that epi has not been concurrently unregistered by + // EpollInstance.DeleteInterest() or EpollInstance.Release(). + if _, ok := ep.interest[epi.key]; ok { + fd.EventUnregister(&epi.waiter) + ep.removeLocked(epi) + } + ep.interestMu.Unlock() + } + // Release implementation resources. fd.impl.Release() if fd.writable { fd.vd.mount.EndWrite() @@ -453,6 +476,21 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.impl.StatFS(ctx) } +// Readiness returns fd's I/O readiness. +func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return fd.impl.Readiness(mask) +} + +// EventRegister registers e for I/O readiness events in mask. +func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.impl.EventRegister(e, mask) +} + +// EventUnregister unregisters e for I/O readiness events. +func (fd *FileDescription) EventUnregister(e *waiter.Entry) { + fd.impl.EventUnregister(e) +} + // PRead reads from the file represented by fd into dst, starting at the given // offset, and returns the number of bytes read. PRead is permitted to return // partial reads with a nil error. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index d730530b9..908c69f91 100755 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -16,15 +16,19 @@ // // Lock order: // -// FilesystemImpl/FileDescriptionImpl locks -// VirtualFilesystem.mountMu -// Dentry.mu -// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry -// VirtualFilesystem.filesystemsMu +// EpollInstance.interestMu +// FileDescription.epollMu +// FilesystemImpl/FileDescriptionImpl locks +// VirtualFilesystem.mountMu +// Dentry.mu +// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry +// VirtualFilesystem.filesystemsMu +// EpollInstance.mu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding -// VirtualFilesystem.mountMu. +// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple +// EpollInstances requires holding epollCycleMu. package vfs import ( diff --git a/pkg/sentry/vfs/vfs_state_autogen.go b/pkg/sentry/vfs/vfs_state_autogen.go index 0a4a7bf35..78d4a0305 100755 --- a/pkg/sentry/vfs/vfs_state_autogen.go +++ b/pkg/sentry/vfs/vfs_state_autogen.go @@ -2,3 +2,37 @@ package vfs +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (x *epollInterestList) beforeSave() {} +func (x *epollInterestList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *epollInterestList) afterLoad() {} +func (x *epollInterestList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *epollInterestEntry) beforeSave() {} +func (x *epollInterestEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *epollInterestEntry) afterLoad() {} +func (x *epollInterestEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func init() { + state.Register("vfs.epollInterestList", (*epollInterestList)(nil), state.Fns{Save: (*epollInterestList).save, Load: (*epollInterestList).load}) + state.Register("vfs.epollInterestEntry", (*epollInterestEntry)(nil), state.Fns{Save: (*epollInterestEntry).save, Load: (*epollInterestEntry).load}) +} |