1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
|
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"fmt"
"sync"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/refs"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
// DefaultTraversalLimit provides a sensible default traversal limit that may
// be passed to FindInode and FindLink. You may want to provide other options in
// individual syscall implementations, but for internal functions this will be
// sane.
const DefaultTraversalLimit = 10
// MountNamespace defines a collection of mounts.
//
// +stateify savable
type MountNamespace struct {
refs.AtomicRefCount
// userns is the user namespace associated with this mount namespace.
//
// All privileged operations on this mount namespace must have
// appropriate capabilities in this userns.
//
// userns is immutable.
userns *auth.UserNamespace
// root is the root directory.
root *Dirent
// mu protects mounts and mountID counter.
mu sync.Mutex `state:"nosave"`
// mounts is a map of the last mounted Dirent -> stack of old Dirents
// that were mounted over, with the oldest mounted Dirent first and
// more recent mounted Dirents at the end of the slice.
//
// A reference to all Dirents in mounts (keys and values) must be held
// to ensure the Dirents are recoverable when unmounting.
mounts map[*Dirent][]*Dirent
// mountID is the next mount id to assign.
mountID uint64
}
// NewMountNamespace returns a new MountNamespace, with the provided node at the
// root, and the given cache size. A root must always be provided.
func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
creds := auth.CredentialsFromContext(ctx)
root.MountSource.mu.Lock()
defer root.MountSource.mu.Unlock()
// Set the root dirent and id on the root mount.
d := NewDirent(root, "/")
root.MountSource.root = d
root.MountSource.id = 1
return &MountNamespace{
userns: creds.UserNamespace,
root: d,
mounts: make(map[*Dirent][]*Dirent),
mountID: 2,
}, nil
}
// UserNamespace returns the user namespace associated with this mount manager.
func (mns *MountNamespace) UserNamespace() *auth.UserNamespace {
return mns.userns
}
// Root returns the MountNamespace's root Dirent and increments its reference
// count. The caller must call DecRef when finished.
func (mns *MountNamespace) Root() *Dirent {
mns.root.IncRef()
return mns.root
}
// FlushMountSourceRefs flushes extra references held by MountSources for all active mount points;
// see fs/mount.go:MountSource.FlushDirentRefs.
func (mns *MountNamespace) FlushMountSourceRefs() {
mns.mu.Lock()
defer mns.mu.Unlock()
mns.flushMountSourceRefsLocked()
}
func (mns *MountNamespace) flushMountSourceRefsLocked() {
// Flush mounts' MountSource references.
for current, stack := range mns.mounts {
current.Inode.MountSource.FlushDirentRefs()
for _, prev := range stack {
prev.Inode.MountSource.FlushDirentRefs()
}
}
// Flush root's MountSource references.
mns.root.Inode.MountSource.FlushDirentRefs()
}
// destroy drops root and mounts dirent references and closes any original nodes.
//
// After destroy is called, the MountNamespace may continue to be referenced (for
// example via /proc/mounts), but should free all resources and shouldn't have
// Find* methods called.
func (mns *MountNamespace) destroy() {
mns.mu.Lock()
defer mns.mu.Unlock()
// Flush all mounts' MountSource references to Dirents. This allows for mount
// points to be torn down since there should be no remaining references after
// this and DecRef below.
mns.flushMountSourceRefsLocked()
// Teardown mounts.
for current, mp := range mns.mounts {
// Drop the mount reference on all mounted dirents.
for _, d := range mp {
d.DecRef()
}
current.DecRef()
}
mns.mounts = nil
// Drop reference on the root.
mns.root.DecRef()
// Wait for asynchronous work (queued by dropping Dirent references
// above) to complete before destroying this MountNamespace.
AsyncBarrier()
}
// DecRef implements RefCounter.DecRef with destructor mns.destroy.
func (mns *MountNamespace) DecRef() {
mns.DecRefWithDestructor(mns.destroy)
}
// Freeze freezes the entire mount tree.
func (mns *MountNamespace) Freeze() {
mns.mu.Lock()
defer mns.mu.Unlock()
// We only want to freeze Dirents with active references, not Dirents referenced
// by a mount's MountSource.
mns.flushMountSourceRefsLocked()
// Freeze the entire shebang.
mns.root.Freeze()
}
// withMountLocked prevents further walks to `node`, because `node` is about to
// be a mount point.
func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error {
mns.mu.Lock()
defer mns.mu.Unlock()
renameMu.Lock()
defer renameMu.Unlock()
// Linux allows mounting over the root (?). It comes with a strange set
// of semantics. We'll just not do this for now.
if node.parent == nil {
return syserror.EBUSY
}
// For both mount and unmount, we take this lock so we can swap out the
// appropriate child in parent.children.
//
// For unmount, this also ensures that if `node` is a mount point, the
// underlying mount's MountSource.direntRefs cannot increase by preventing
// walks to node.
node.parent.dirMu.Lock()
defer node.parent.dirMu.Unlock()
node.parent.mu.Lock()
defer node.parent.mu.Unlock()
// We need not take node.dirMu since we have parent.dirMu.
// We need to take node.mu, so that we can check for deletion.
node.mu.Lock()
defer node.mu.Unlock()
return fn()
}
// Mount mounts a `inode` over the subtree at `node`.
func (mns *MountNamespace) Mount(ctx context.Context, node *Dirent, inode *Inode) error {
return mns.withMountLocked(node, func() error {
// replacement already has one reference taken; this is the mount
// reference.
replacement, err := node.mount(ctx, inode)
if err != nil {
return err
}
// Set child/parent dirent relationship.
parentMountSource := node.Inode.MountSource
childMountSource := inode.MountSource
parentMountSource.mu.Lock()
defer parentMountSource.mu.Unlock()
childMountSource.mu.Lock()
defer childMountSource.mu.Unlock()
parentMountSource.children[childMountSource] = struct{}{}
childMountSource.parent = parentMountSource
// Set the mount's root dirent and id.
childMountSource.root = replacement
childMountSource.id = mns.mountID
mns.mountID++
// Drop node from its dirent cache.
node.dropExtendedReference()
// If node is already a mount point, push node on the stack so it can
// be recovered on unmount.
if stack, ok := mns.mounts[node]; ok {
mns.mounts[replacement] = append(stack, node)
delete(mns.mounts, node)
return nil
}
// Was not already mounted, just add another mount point.
// Take a reference on node so it can be recovered on unmount.
node.IncRef()
mns.mounts[replacement] = []*Dirent{node}
return nil
})
}
// Unmount ensures no references to the MountSource remain and removes `node` from
// this subtree. The subtree formerly mounted in `node`'s place will be
// restored. node's MountSource will be destroyed as soon as the last reference to
// `node` is dropped, as no references to Dirents within will remain.
//
// If detachOnly is set, Unmount merely removes `node` from the subtree, but
// allows existing references to the MountSource remain. E.g. if an open file still
// refers to Dirents in MountSource, the Unmount will succeed anyway and MountSource will
// be destroyed at a later time when all references to Dirents within are
// dropped.
//
// The caller must hold a reference to node from walking to it.
func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly bool) error {
// This takes locks to prevent further walks to Dirents in this mount
// under the assumption that `node` is the root of the mount.
return mns.withMountLocked(node, func() error {
origs, ok := mns.mounts[node]
if !ok {
// node is not a mount point.
return syserror.EINVAL
}
if len(origs) == 0 {
panic("cannot unmount initial dirent")
}
m := node.Inode.MountSource
if !detachOnly {
// Flush all references on the mounted node.
m.FlushDirentRefs()
// At this point, exactly two references must be held
// to mount: one mount reference on node, and one due
// to walking to node.
//
// We must also be guaranteed that no more references
// can be taken on mount. This is why withMountLocked
// must be held at this point to prevent any walks to
// and from node.
if refs := m.DirentRefs(); refs < 2 {
panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs))
} else if refs != 2 {
return syserror.EBUSY
}
}
// Lock the parent MountSource first, if it exists. We are
// holding mns.Lock, so the parent can not change out
// from under us.
parent := m.Parent()
if parent != nil {
parent.mu.Lock()
defer parent.mu.Unlock()
}
// Lock the mount that is being unmounted.
m.mu.Lock()
defer m.mu.Unlock()
if m.parent != nil {
// Sanity check.
if _, ok := m.parent.children[m]; !ok {
panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent))
}
delete(m.parent.children, m)
}
original := origs[len(origs)-1]
if err := node.unmount(ctx, original); err != nil {
return err
}
switch {
case len(origs) > 1:
mns.mounts[original] = origs[:len(origs)-1]
case len(origs) == 1:
// Drop mount reference taken at the end of
// MountNamespace.Mount.
original.DecRef()
}
delete(mns.mounts, node)
return nil
})
}
// FindLink returns an Dirent from a given node, which may be a symlink.
//
// The root argument is treated as the root directory, and FindLink will not
// return anything above that. The wd dirent provides the starting directory,
// and may be nil which indicates the root should be used. You must call DecRef
// on the resulting Dirent when you are no longer using the object.
//
// If wd is nil, then the root will be used as the working directory. If the
// path is absolute, this has no functional impact.
//
// Precondition: root must be non-nil.
// Precondition: the path must be non-empty.
func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) {
if root == nil {
panic("MountNamespace.FindInode: root must not be nil")
}
if len(path) == 0 {
panic("MountNamespace.FindInode: path is empty")
}
// Split the path.
first, remainder := SplitFirst(path)
// Where does this walk originate?
current := wd
if current == nil {
current = root
}
for first == "/" {
// Special case: it's possible that we have nothing to walk at
// all. This is necessary since we're resplitting the path.
if remainder == "" {
root.IncRef()
return root, nil
}
// Start at the root and advance the path component so that the
// walk below can proceed. Note at this point, it handles the
// no-op walk case perfectly fine.
current = root
first, remainder = SplitFirst(remainder)
}
current.IncRef() // Transferred during walk.
for {
// Check that the file is a directory and that we have
// permissions to walk.
//
// Note that we elide this check for the root directory as an
// optimization; a non-executable root may still be walked. A
// non-directory root is hopeless.
if current != root {
if !IsDir(current.Inode.StableAttr) {
current.DecRef() // Drop reference from above.
return nil, syserror.ENOTDIR
}
if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil {
current.DecRef() // Drop reference from above.
return nil, err
}
}
// Move to the next level.
next, err := current.Walk(ctx, root, first)
if err != nil {
// Allow failed walks to cache the dirent, because no
// children will acquire a reference at the end.
current.maybeExtendReference()
current.DecRef()
return nil, err
}
// Drop old reference.
current.DecRef()
if remainder != "" {
// Ensure it's resolved, unless it's the last level.
//
// See resolve for reference semantics; on err next
// will have one dropped.
current, err = mns.resolve(ctx, root, next, maxTraversals)
if err != nil {
return nil, err
}
} else {
// Allow the file system to take an extra reference on the
// found child. This will hold a reference on the containing
// directory, so the whole tree will be implicitly cached.
next.maybeExtendReference()
return next, nil
}
// Move to the next element.
first, remainder = SplitFirst(remainder)
}
}
// FindInode is identical to FindLink except the return value is resolved.
//
//go:nosplit
func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) {
d, err := mns.FindLink(ctx, root, wd, path, maxTraversals)
if err != nil {
return nil, err
}
// See resolve for reference semantics; on err d will have the
// reference dropped.
return mns.resolve(ctx, root, d, maxTraversals)
}
// resolve resolves the given link.
//
// If successful, a reference is dropped on node and one is acquired on the
// caller's behalf for the returned dirent.
//
// If not successful, a reference is _also_ dropped on the node and an error
// returned. This is for convenience in using resolve directly as a return
// value.
func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxTraversals uint) (*Dirent, error) {
// Resolve the path.
target, err := node.Inode.Getlink(ctx)
switch err {
case nil:
// Make sure we didn't exhaust the traversal budget.
if maxTraversals == 0 {
target.DecRef()
return nil, syscall.ELOOP
}
node.DecRef() // Drop the original reference.
return target, nil
case syscall.ENOLINK:
// Not a symlink.
return node, nil
case ErrResolveViaReadlink:
defer node.DecRef() // See above.
// First, check if we should traverse.
if maxTraversals == 0 {
return nil, syscall.ELOOP
}
// Read the target path.
targetPath, err := node.Inode.Readlink(ctx)
if err != nil {
return nil, err
}
// Find the node; we resolve relative to the current symlink's parent.
d, err := mns.FindInode(ctx, root, node.parent, targetPath, maxTraversals-1)
if err != nil {
return nil, err
}
return d, err
default:
node.DecRef() // Drop for err; see above.
// Propagate the error.
return nil, err
}
}
// SyncAll calls Dirent.SyncAll on the root.
func (mns *MountNamespace) SyncAll(ctx context.Context) {
mns.mu.Lock()
defer mns.mu.Unlock()
mns.root.SyncAll(ctx)
}
|