1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
|
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package epoll provides an implementation of Linux's IO event notification
// facility. See epoll(7) for more details.
package epoll
import (
"fmt"
"sync"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/ilist"
"gvisor.googlesource.com/gvisor/pkg/refs"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/waiter"
)
// Event describes the event mask that was observed and the user data to be
// returned when one of the events occurs. It has this format to match the linux
// format to avoid extra copying/allocation when writing events to userspace.
type Event struct {
// Events is the event mask containing the set of events that have been
// observed on an entry.
Events uint32
// Data is an opaque 64-bit value provided by the caller when adding the
// entry, and returned to the caller when the entry reports an event.
Data [2]int32
}
// EntryFlags is a bitmask that holds an entry's flags.
type EntryFlags int
// Valid entry flags.
const (
OneShot EntryFlags = 1 << iota
EdgeTriggered
)
// FileIdentifier identifies a file. We cannot use just the FD because it could
// potentially be reassigned. We also cannot use just the file pointer because
// it is possible to have multiple entries for the same file object as long as
// they are created with different FDs (i.e., the FDs point to the same file).
//
// +stateify savable
type FileIdentifier struct {
File *fs.File
Fd kdefs.FD
}
// pollEntry holds all the state associated with an event poll entry, that is,
// a file being observed by an event poll object.
//
// +stateify savable
type pollEntry struct {
ilist.Entry
file *refs.WeakRef `state:"manual"`
id FileIdentifier `state:"wait"`
userData [2]int32
waiter waiter.Entry `state:"manual"`
mask waiter.EventMask
flags EntryFlags
epoll *EventPoll
// We cannot save the current list pointer as it points into EventPoll
// struct, while state framework currently does not support such
// in-struct pointers. Instead, EventPoll will properly set this field
// in its loading logic.
curList *ilist.List `state:"nosave"`
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
// weakReferenceGone is called when the file in the weak reference is destroyed.
// The poll entry is removed in response to this.
func (p *pollEntry) WeakRefGone() {
p.epoll.RemoveEntry(p.id)
}
// EventPoll holds all the state associated with an event poll object, that is,
// collection of files to observe and their current state.
//
// +stateify savable
type EventPoll struct {
fsutil.PipeSeek `state:"zerovalue"`
fsutil.NotDirReaddir `state:"zerovalue"`
fsutil.NoFsync `state:"zerovalue"`
fsutil.NoopFlush `state:"zerovalue"`
fsutil.NoMMap `state:"zerovalue"`
fsutil.NoIoctl `state:"zerovalue"`
// Wait queue is used to notify interested parties when the event poll
// object itself becomes readable or writable.
waiter.Queue `state:"zerovalue"`
// files is the map of all the files currently being observed, it is
// protected by mu.
mu sync.Mutex `state:"nosave"`
files map[FileIdentifier]*pollEntry
// listsMu protects manipulation of the lists below. It needs to be a
// different lock to avoid circular lock acquisition order involving
// the wait queue mutexes and mu. The full order is mu, observed file
// wait queue mutex, then listsMu; this allows listsMu to be acquired
// when readyCallback is called.
//
// An entry is always in one of the following lists:
// readyList -- when there's a chance that it's ready to have
// events delivered to epoll waiters. Given that being
// ready is a transient state, the Readiness() and
// readEvents() functions always call the entry's file
// Readiness() function to confirm it's ready.
// waitingList -- when there's no chance that the entry is ready,
// so it's waiting for the readyCallback to be called
// on it before it gets moved to the readyList.
// disabledList -- when the entry is disabled. This happens when
// a one-shot entry gets delivered via readEvents().
listsMu sync.Mutex `state:"nosave"`
readyList ilist.List
waitingList ilist.List
disabledList ilist.List
}
// cycleMu is used to serialize all the cycle checks. This is only used when
// an event poll file is added as an entry to another event poll. Such checks
// are serialized to avoid lock acquisition order inversion: if a thread is
// adding A to B, and another thread is adding B to A, each would acquire A's
// and B's mutexes in reverse order, and could cause deadlocks. Having this
// lock prevents this by allowing only one check at a time to happen.
//
// We do the cycle check to prevent callers from introducing potentially
// infinite recursions. If a caller were to add A to B and then B to A, for
// event poll A to know if it's readable, it would need to check event poll B,
// which in turn would need event poll A and so on indefinitely.
var cycleMu sync.Mutex
// NewEventPoll allocates and initializes a new event poll object.
func NewEventPoll(ctx context.Context) *fs.File {
// name matches fs/eventpoll.c:epoll_create1.
dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
files: make(map[FileIdentifier]*pollEntry),
})
}
// Release implements fs.FileOperations.Release.
func (e *EventPoll) Release() {
// We need to take the lock now because files may be attempting to
// remove entries in parallel if they get destroyed.
e.mu.Lock()
defer e.mu.Unlock()
// Go through all entries and clean up.
for _, entry := range e.files {
entry.id.File.EventUnregister(&entry.waiter)
entry.file.Drop()
}
}
// Read implements fs.FileOperations.Read.
func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
return 0, syscall.ENOSYS
}
// Write implements fs.FileOperations.Write.
func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
return 0, syscall.ENOSYS
}
// eventsAvailable determines if 'e' has events available for delivery.
func (e *EventPoll) eventsAvailable() bool {
e.listsMu.Lock()
for it := e.readyList.Front(); it != nil; {
entry := it.(*pollEntry)
it = it.Next()
// If the entry is ready, we know 'e' has at least one entry
// ready for delivery.
ready := entry.id.File.Readiness(entry.mask)
if ready != 0 {
e.listsMu.Unlock()
return true
}
// Entry is not ready, so move it to waiting list.
e.readyList.Remove(entry)
e.waitingList.PushBack(entry)
entry.curList = &e.waitingList
}
e.listsMu.Unlock()
return false
}
// Readiness determines if the event poll object is currently readable (i.e.,
// if there are pending events for delivery).
func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
ready := waiter.EventMask(0)
if (mask&waiter.EventIn) != 0 && e.eventsAvailable() {
ready |= waiter.EventIn
}
return ready
}
// ReadEvents returns up to max available events.
func (e *EventPoll) ReadEvents(max int) []Event {
var local ilist.List
var ret []Event
e.listsMu.Lock()
// Go through all entries we believe may be ready.
for it := e.readyList.Front(); it != nil && len(ret) < max; {
entry := it.(*pollEntry)
it = it.Next()
// Check the entry's readiness. It it's not really ready, we
// just put it back in the waiting list and move on to the next
// entry.
ready := entry.id.File.Readiness(entry.mask) & entry.mask
if ready == 0 {
e.readyList.Remove(entry)
e.waitingList.PushBack(entry)
entry.curList = &e.waitingList
continue
}
// Add event to the array that will be returned to caller.
ret = append(ret, Event{
Events: uint32(ready),
Data: entry.userData,
})
// The entry is consumed, so we must move it to the disabled
// list in case it's one-shot, or back to the wait list if it's
// edge-triggered. If it's neither, we leave it in the ready
// list so that its readiness can be checked the next time
// around; however, we must move it to the end of the list so
// that other events can be delivered as well.
e.readyList.Remove(entry)
if entry.flags&OneShot != 0 {
e.disabledList.PushBack(entry)
entry.curList = &e.disabledList
} else if entry.flags&EdgeTriggered != 0 {
e.waitingList.PushBack(entry)
entry.curList = &e.waitingList
} else {
local.PushBack(entry)
}
}
e.readyList.PushBackList(&local)
e.listsMu.Unlock()
return ret
}
// readyCallback is called when one of the files we're polling becomes ready. It
// moves said file to the readyList if it's currently in the waiting list.
type readyCallback struct{}
// Callback implements waiter.EntryCallback.Callback.
func (*readyCallback) Callback(w *waiter.Entry) {
entry := w.Context.(*pollEntry)
e := entry.epoll
e.listsMu.Lock()
if entry.curList == &e.waitingList {
e.waitingList.Remove(entry)
e.readyList.PushBack(entry)
entry.curList = &e.readyList
e.Notify(waiter.EventIn)
}
e.listsMu.Unlock()
}
// initEntryReadiness initializes the entry's state with regards to its
// readiness by placing it in the appropriate list and registering for
// notifications.
func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
// A new entry starts off in the waiting list.
e.listsMu.Lock()
e.waitingList.PushBack(entry)
entry.curList = &e.waitingList
e.listsMu.Unlock()
// Register for event notifications.
f := entry.id.File
f.EventRegister(&entry.waiter, entry.mask)
// Check if the file happens to already be in a ready state.
ready := f.Readiness(entry.mask) & entry.mask
if ready != 0 {
(*readyCallback).Callback(nil, &entry.waiter)
}
}
// observes checks if event poll object e is directly or indirectly observing
// event poll object ep. It uses a bounded recursive depth-first search.
func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool {
// If we reached the maximum depth, we'll consider that we found it
// because we don't want to allow chains that are too long.
if depthLeft <= 0 {
return true
}
e.mu.Lock()
defer e.mu.Unlock()
// Go through each observed file and check if it is or observes ep.
for id := range e.files {
f, ok := id.File.FileOperations.(*EventPoll)
if !ok {
continue
}
if f == ep || f.observes(ep, depthLeft-1) {
return true
}
}
return false
}
// AddEntry adds a new file to the collection of files observed by e.
func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
// Acquire cycle check lock if another event poll is being added.
ep, ok := id.File.FileOperations.(*EventPoll)
if ok {
cycleMu.Lock()
defer cycleMu.Unlock()
}
e.mu.Lock()
defer e.mu.Unlock()
// Fail if the file already has an entry.
if _, ok := e.files[id]; ok {
return syscall.EEXIST
}
// Check if a cycle would be created. We use 4 as the limit because
// that's the value used by linux and we want to emulate it.
if ep != nil {
if e == ep {
return syscall.EINVAL
}
if ep.observes(e, 4) {
return syscall.ELOOP
}
}
// Create new entry and add it to map.
//
// N.B. Even though we are creating a weak reference here, we know it
// won't trigger a callback because we hold a reference to the file
// throughout the execution of this function.
entry := &pollEntry{
id: id,
userData: data,
epoll: e,
flags: flags,
waiter: waiter.Entry{Callback: &readyCallback{}},
mask: mask,
}
entry.waiter.Context = entry
e.files[id] = entry
entry.file = refs.NewWeakRef(id.File, entry)
// Initialize the readiness state of the new entry.
e.initEntryReadiness(entry)
return nil
}
// UpdateEntry updates the flags, mask and user data associated with a file that
// is already part of the collection of observed files.
func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
e.mu.Lock()
defer e.mu.Unlock()
// Fail if the file doesn't have an entry.
entry, ok := e.files[id]
if !ok {
return syscall.ENOENT
}
// Unregister the old mask and remove entry from the list it's in, so
// readyCallback is guaranteed to not be called on this entry anymore.
entry.id.File.EventUnregister(&entry.waiter)
// Remove entry from whatever list it's in. This ensure that no other
// threads have access to this entry as the only way left to find it
// is via e.files, but we hold e.mu, which prevents that.
e.listsMu.Lock()
entry.curList.Remove(entry)
e.listsMu.Unlock()
// Initialize new readiness state.
entry.flags = flags
entry.mask = mask
entry.userData = data
e.initEntryReadiness(entry)
return nil
}
// RemoveEntry a files from the collection of observed files.
func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
e.mu.Lock()
defer e.mu.Unlock()
// Fail if the file doesn't have an entry.
entry, ok := e.files[id]
if !ok {
return syscall.ENOENT
}
// Unregister from file first so that no concurrent attempts will be
// made to manipulate the file.
entry.id.File.EventUnregister(&entry.waiter)
// Remove from the current list.
e.listsMu.Lock()
entry.curList.Remove(entry)
entry.curList = nil
e.listsMu.Unlock()
// Remove file from map, and drop weak reference.
delete(e.files, id)
entry.file.Drop()
return nil
}
// UnregisterEpollWaiters removes the epoll waiter objects from the waiting
// queues. This is different from Release() as the file is not dereferenced.
func (e *EventPoll) UnregisterEpollWaiters() {
e.mu.Lock()
defer e.mu.Unlock()
for _, entry := range e.files {
entry.id.File.EventUnregister(&entry.waiter)
}
}
|