1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
|
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
import (
"bytes"
"fmt"
"sort"
"sync"
"sync/atomic"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/refs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
)
// FDs is a slice of file descriptor numbers. Its Len, Swap and Less methods
// let it be passed to sort.Sort, which arranges the descriptors in ascending
// numeric order.
type FDs []kdefs.FD

// Len reports how many descriptors the slice holds.
func (f FDs) Len() int { return len(f) }

// Swap exchanges the descriptors stored at positions i and j.
func (f FDs) Swap(i, j int) {
	tmp := f[i]
	f[i] = f[j]
	f[j] = tmp
}

// Less reports whether the descriptor at position i is numerically smaller
// than the descriptor at position j.
func (f FDs) Less(i, j int) bool {
	return f[j] > f[i]
}
// FDFlags holds the flags attached to an individual descriptor.
//
// +stateify savable
type FDFlags struct {
	// CloseOnExec is true when the descriptor should be closed on exec.
	CloseOnExec bool
}

// ToLinuxFileFlags expresses these flags as a Linux file-flags mask: the
// result is linux.O_CLOEXEC when CloseOnExec is set and 0 otherwise.
func (f FDFlags) ToLinuxFileFlags() (mask uint) {
	if !f.CloseOnExec {
		return 0
	}
	return linux.O_CLOEXEC
}

// ToLinuxFDFlags expresses these flags as a Linux descriptor-flags mask: the
// result is linux.FD_CLOEXEC when CloseOnExec is set and 0 otherwise.
func (f FDFlags) ToLinuxFDFlags() (mask uint) {
	if !f.CloseOnExec {
		return 0
	}
	return linux.FD_CLOEXEC
}
// descriptor holds the details about a file descriptor, namely a pointer to
// the file itself and the descriptor flags. It is the value type stored in
// FDMap.files.
//
// +stateify savable
type descriptor struct {
// file is the open file this descriptor refers to.
file *fs.File
// flags are the per-descriptor flags (currently only close-on-exec).
flags FDFlags
}
// FDMap is used to manage File references and flags. It maps descriptor
// numbers to descriptor entries and is itself reference counted.
//
// +stateify savable
type FDMap struct {
// AtomicRefCount is embedded to provide the reference count; DecRef uses
// its DecRefWithDestructor with f.destroy as the destructor.
refs.AtomicRefCount
// k is the Kernel that created this map in NewFDMap. Fork calls
// k.NewFDMap to allocate the clone.
k *Kernel
// files maps each allocated descriptor number to its file and flags. The
// exported methods in this file take mu before touching it.
files map[kdefs.FD]descriptor
// mu guards files. It carries the state:"nosave" tag.
mu sync.RWMutex `state:"nosave"`
// uid is the value returned by ID. NewFDMap assigns it by atomically
// incrementing k.fdMapUids, and unlock passes it (as a lock.UniqueID) to
// UnlockRegion.
uid uint64
}
// ID returns the identifier of this FDMap, i.e. the uid field that NewFDMap
// assigned from the kernel's fdMapUids counter when the map was created.
func (f *FDMap) ID() uint64 {
return f.uid
}
// NewFDMap creates an empty FDMap bound to k. The map's uid is obtained by
// atomically incrementing k.fdMapUids, so each call observes a distinct
// counter value.
func (k *Kernel) NewFDMap() *FDMap {
	m := new(FDMap)
	m.k = k
	m.files = make(map[kdefs.FD]descriptor)
	m.uid = atomic.AddUint64(&k.fdMapUids, 1)
	return m
}
// destroy empties the map by calling RemoveIf with a predicate that accepts
// every descriptor. It is the destructor that DecRef hands to
// DecRefWithDestructor.
func (f *FDMap) destroy() {
	always := func(*fs.File, FDFlags) bool { return true }
	f.RemoveIf(always)
}
// DecRef implements RefCounter.DecRef with destructor f.destroy: it forwards
// to DecRefWithDestructor, passing f.destroy (which removes every descriptor
// from the map) as the destructor.
func (f *FDMap) DecRef() {
f.DecRefWithDestructor(f.destroy)
}
// Size reports how many descriptors are currently present in the map. The
// count is read while holding the read lock.
func (f *FDMap) Size() int {
	f.mu.RLock()
	n := len(f.files)
	f.mu.RUnlock()
	return n
}
// String implements fmt.Stringer for FDMap. It returns one line per
// descriptor of the form "\tfd:<n> => name <path>\n", where <path> is the
// name reported by the file's Dirent.FullName with a nil root.
//
// Lines are emitted in ascending descriptor order so that the output is
// deterministic; ranging over the map directly would yield a different
// order on every call.
func (f *FDMap) String() string {
	f.mu.RLock()
	defer f.mu.RUnlock()

	var b bytes.Buffer
	// f.fds() reads f.files without locking; the read lock taken above
	// covers it.
	for _, fd := range f.fds() {
		// Only the name is used; FullName's second result is ignored.
		n, _ := f.files[fd].file.Dirent.FullName(nil /* root */)
		// Format straight into the buffer rather than building an
		// intermediate string with Sprintf.
		fmt.Fprintf(&b, "\tfd:%d => name %s\n", fd, n)
	}
	return b.String()
}
// NewFDFrom allocates a new FD guaranteed to be the lowest number available
// greater than or equal to fd. This property is important as Unix programs
// tend to count on this allocation order.
//
// On success the file's reference count is incremented, the descriptor is
// stored with the given flags, and the chosen FD is returned.
//
// Errors:
//   - syscall.EINVAL (with FD 0) if fd is negative.
//   - syscall.EMFILE (with FD -1) if no free descriptor exists below the
//     current limits.NumberOfFiles limit in limitSet.
func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) {
	if fd < 0 {
		// Don't accept negative FDs.
		return 0, syscall.EINVAL
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	lim := limitSet.Get(limits.NumberOfFiles)
	unlimited := lim.Cur == limits.Infinity

	// Find the lowest fd not in the files map.
	//
	// The bound is compared in unsigned 64-bit width rather than by
	// converting lim.Cur down to kdefs.FD: that narrowing conversion wraps
	// for a finite limit that does not fit in an FD, which would make the
	// loop reject every descriptor with EMFILE. i is non-negative here, so
	// widening it is lossless. The i >= 0 guard stops the scan if i++ wraps
	// around instead of handing out a negative descriptor.
	// NOTE(review): assumes lim.Cur is a uint64 and kdefs.FD is a narrower
	// signed integer — confirm against the limits and kdefs packages.
	for i := fd; i >= 0 && (unlimited || uint64(i) < lim.Cur); i++ {
		if _, ok := f.files[i]; !ok {
			file.IncRef()
			f.files[i] = descriptor{file: file, flags: flags}
			return i, nil
		}
	}
	return -1, syscall.EMFILE
}
// NewFDAt sets the file reference for the given FD. If there is an
// active reference for that FD, the ref count for that existing reference
// is decremented.
//
// On success the new file's reference count is incremented and nil is
// returned.
//
// Errors:
//   - syscall.EBADF if fd is negative.
//   - syscall.EMFILE if fd is at or beyond the effective
//     limits.NumberOfFiles limit in limitSet; the map is left unchanged.
func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error {
	if fd < 0 {
		// Don't accept negative FDs.
		return syscall.EBADF
	}

	// In this one case we do not defer the Unlock. The reason is that we
	// must have done all the work needed for discarding any old open file
	// before we return to the caller. In other words, the DecRef() below
	// must have completed by the time we return, to ensure its side
	// effects are, in fact, effected. A classic example is dup2(fd1, fd2):
	// if fd2 was already open, it must be closed, and we don't want to
	// resume the caller until it is; we have to block on the DecRef().
	// Hence we cannot just do a 'go oldfile.DecRef()', since there would
	// be no guarantee that it finished before the caller resumed. Since we
	// must wait for the DecRef() to finish, and that could take time, it's
	// best to call f.mu.Unlock first so that we are not blocking other
	// users of this FDMap during the DecRef() call.
	f.mu.Lock()

	oldDesc, oldExists := f.files[fd]
	lim := limitSet.Get(limits.NumberOfFiles).Cur
	// If we're replacing an open descriptor then the effective limit is one
	// more than the actual limit.
	if oldExists && lim != limits.Infinity {
		lim++
	}
	// Compare in unsigned 64-bit width instead of narrowing lim to kdefs.FD:
	// the narrowing conversion wraps for a finite limit that does not fit in
	// an FD and would reject valid descriptors. fd is non-negative here, so
	// widening it is lossless.
	// NOTE(review): assumes the limit's Cur field is a uint64 — confirm
	// against the limits package.
	if lim != limits.Infinity && uint64(fd) >= lim {
		f.mu.Unlock()
		return syscall.EMFILE
	}

	file.IncRef()
	f.files[fd] = descriptor{file: file, flags: flags}
	f.mu.Unlock()

	// Release the displaced file outside the lock (see above).
	if oldExists {
		oldDesc.file.DecRef()
	}
	return nil
}
// SetFlags replaces the flags stored for fd, keeping the same file. If fd is
// not present in the map the call does nothing.
func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) {
	f.mu.Lock()
	defer f.mu.Unlock()

	if cur, found := f.files[fd]; found {
		cur.flags = flags
		f.files[fd] = cur
	}
}
// GetDescriptor looks up fd and returns its file together with its flags,
// after incrementing the file's reference count. The caller must call DecRef
// on the returned file when finished with it. If fd is not in the map, it
// returns nil and a zero FDFlags.
func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) {
	f.mu.RLock()
	defer f.mu.RUnlock()

	entry, found := f.files[fd]
	if !found {
		return nil, FDFlags{}
	}
	entry.file.IncRef()
	return entry.file, entry.flags
}
// GetFile looks up fd and returns its file after incrementing the file's
// reference count. The caller must call DecRef on the returned file when
// finished with it. If fd is not in the map, it returns nil.
func (f *FDMap) GetFile(fd kdefs.FD) *fs.File {
	f.mu.RLock()
	entry, found := f.files[fd]
	if found {
		// Take the reference while still holding the read lock.
		entry.file.IncRef()
	}
	f.mu.RUnlock()

	if !found {
		return nil
	}
	return entry.file
}
// fds returns every descriptor number in the map, sorted in ascending order.
// It reads f.files without taking f.mu; the callers in this file hold the
// lock around the call.
func (f *FDMap) fds() FDs {
	ordered := make(FDs, len(f.files))
	idx := 0
	for fd := range f.files {
		ordered[idx] = fd
		idx++
	}
	sort.Sort(ordered)
	return ordered
}
// GetFDs returns the descriptor numbers currently in the map, in ascending
// order, captured under the read lock.
func (f *FDMap) GetFDs() FDs {
	f.mu.RLock()
	ordered := f.fds()
	f.mu.RUnlock()
	return ordered
}
// GetRefs returns the files of all descriptors in the map, ordered by
// ascending descriptor number, incrementing the reference count of each one.
// The caller must call DecRef on every returned file when done with the
// slice.
func (f *FDMap) GetRefs() []*fs.File {
	f.mu.RLock()
	defer f.mu.RUnlock()

	ordered := f.fds()
	files := make([]*fs.File, len(ordered))
	for i, fd := range ordered {
		file := f.files[fd].file
		file.IncRef()
		files[i] = file
	}
	return files
}
// Fork creates a new FDMap (via the owning kernel's NewFDMap) that contains
// the same descriptor entries as f. Each file gains one extra reference on
// behalf of the new map; the two maps share the files but have separate
// tables.
func (f *FDMap) Fork() *FDMap {
	f.mu.RLock()
	defer f.mu.RUnlock()

	child := f.k.NewFDMap()
	for fd := range f.files {
		entry := f.files[fd]
		// The child holds its own reference to every file.
		entry.file.IncRef()
		child.files[fd] = entry
	}
	return child
}
// unlock calls UnlockRegion on the file's POSIX lock context for the range
// from 0 to lock.LockEOF, using this FDMap's ID as the lock owner. file must
// not be nil.
func (f *FDMap) unlock(file *fs.File) {
	wholeFile := lock.LockRange{0, lock.LockEOF}
	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.ID()), wholeFile)
}
// inotifyFileClose delivers the inotify close event for f to f's Dirent. The
// event mask is IN_CLOSE_WRITE when the file's flags have Write set and
// IN_CLOSE_NOWRITE otherwise, with IN_ISDIR added when the inode is a
// directory.
func inotifyFileClose(f *fs.File) {
	dirent := f.Dirent

	var mask uint32
	if fs.IsDir(dirent.Inode.StableAttr) {
		mask |= linux.IN_ISDIR
	}
	switch {
	case f.Flags().Write:
		mask |= linux.IN_CLOSE_WRITE
	default:
		mask |= linux.IN_CLOSE_NOWRITE
	}

	dirent.InotifyEvent(mask, 0)
}
// Remove deletes fd from the map. If a file was stored for it, Remove
// releases the locks held under this FDMap's ID, emits the inotify close
// event, and returns (file, true); the caller is expected to DecRef the
// returned file. If no file was stored, it returns (nil, false).
func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) {
	f.mu.Lock()
	file := f.files[fd].file
	delete(f.files, fd)
	f.mu.Unlock()

	if file == nil {
		return nil, false
	}

	// Lock release and notification happen outside f.mu.
	f.unlock(file)
	inotifyFileClose(file)
	return file, true
}
// RemoveIf deletes every descriptor whose file is non-nil and for which cond
// returns true. cond is invoked while f.mu is held. After the lock is
// dropped, each removed file has its locks released, its inotify close event
// emitted, and its reference count decremented.
func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) {
	var closing []*fs.File

	f.mu.Lock()
	for fd, desc := range f.files {
		if desc.file == nil || !cond(desc.file, desc.flags) {
			continue
		}
		delete(f.files, fd)
		closing = append(closing, desc.file)
	}
	f.mu.Unlock()

	for i := range closing {
		file := closing[i]
		f.unlock(file)
		inotifyFileClose(file)
		file.DecRef()
	}
}
|