1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
|
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package memfs provides a filesystem implementation that behaves like tmpfs:
// the Dentry tree is the sole source of truth for the state of the filesystem.
//
// memfs is intended primarily to demonstrate filesystem implementation
// patterns. Real use cases for an in-memory filesystem should use tmpfs
// instead.
//
// Lock order:
//
// filesystem.mu
// regularFileFD.offMu
// regularFile.mu
// inode.mu
package memfs
import (
"fmt"
"sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
// FilesystemType implements vfs.FilesystemType. It carries no state; each
// call to its GetFilesystem method constructs a new memfs filesystem.
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
type filesystem struct {
	// vfsfs is the vfs.Filesystem for which this filesystem is the
	// implementation; FilesystemType.GetFilesystem initializes it.
	vfsfs vfs.Filesystem

	// mu serializes changes to the Dentry tree.
	mu sync.RWMutex

	// nextInoMinusOne is one less than the next inode number to hand out:
	// inode.init atomically increments it and uses the incremented value as
	// the new inode's number.
	nextInoMinusOne uint64 // accessed using atomic memory operations
}
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
//
// It allocates a fresh filesystem, registers it with vfsObj, and creates a
// root directory through newDirectory(creds, 01777). The returned values are
// the filesystem's vfs.Filesystem and the root's vfs.Dentry; the error is
// always nil. ctx, source and opts are not consulted.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	memFS := new(filesystem)
	memFS.vfsfs.Init(vfsObj, memFS)

	rootInode := memFS.newDirectory(creds, 01777)
	rootDentry := memFS.newDentry(rootInode)

	return &memFS.vfsfs, &rootDentry.vfsd, nil
}
// Release implements vfs.FilesystemImpl.Release. It is a no-op.
func (fs *filesystem) Release() {
	// Intentionally empty: this implementation does no cleanup here.
}
// Sync implements vfs.FilesystemImpl.Sync. It does no work and always
// returns nil.
func (fs *filesystem) Sync(ctx context.Context) error {
	// All filesystem state is in-memory.
	return nil
}
// dentry implements vfs.DentryImpl.
type dentry struct {
	// vfsd is the vfs.Dentry for which this dentry is the implementation;
	// filesystem.newDentry initializes it.
	vfsd vfs.Dentry

	// inode is the inode represented by this dentry. Multiple Dentries may
	// share a single non-directory inode (with hard links). inode is
	// immutable.
	inode *inode

	// memfs doesn't count references on dentries; because the dentry tree is
	// the sole source of truth, it is by definition always consistent with the
	// state of the filesystem. However, it does count references on inodes,
	// because inode resources are released when all references are dropped.
	// (memfs doesn't really have resources to release, but we implement
	// reference counting because tmpfs regular files will.)

	// dentryEntry (ugh) links dentries into their parent directory.childList.
	dentryEntry
}
// newDentry allocates a dentry that represents inode and initializes its
// embedded vfs.Dentry with the new dentry as its implementation. It does not
// change inode's reference count.
func (fs *filesystem) newDentry(inode *inode) *dentry {
	created := new(dentry)
	created.inode = inode
	created.vfsd.Init(created)
	return created
}
// IncRef implements vfs.DentryImpl.IncRef. memfs counts references on inodes
// rather than on dentries, so this takes a reference on d's inode.
func (d *dentry) IncRef() {
	d.inode.incRef()
}
// TryIncRef implements vfs.DentryImpl.TryIncRef. It attempts to take a
// reference on d's inode and reports whether it succeeded; it fails when the
// inode's reference count is already zero.
func (d *dentry) TryIncRef() bool {
	return d.inode.tryIncRef()
}
// DecRef implements vfs.DentryImpl.DecRef. It drops a reference on d's
// inode.
func (d *dentry) DecRef() {
	d.inode.decRef()
}
// inode represents a filesystem object.
type inode struct {
	// refs is a reference count. refs is accessed using atomic memory
	// operations. inode.init sets it to 1.
	//
	// A reference is held on all inodes that are reachable in the filesystem
	// tree. For non-directories (which may have multiple hard links), this
	// means that a reference is dropped when nlink reaches 0. For directories,
	// nlink never reaches 0 due to the "." entry; instead,
	// filesystem.RmdirAt() drops the reference.
	refs int64

	// Inode metadata; protected by mu and accessed using atomic memory
	// operations unless otherwise specified.
	mu    sync.RWMutex
	mode  uint32 // excluding file type bits, which are based on impl
	nlink uint32 // protected by filesystem.mu instead of inode.mu
	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid   uint32 // auth.KGID, but ...
	ino   uint64 // immutable

	// impl is the type-specific part of the inode. The code in this file
	// type-switches on *regularFile, *directory, *symlink and *namedPipe.
	impl interface{} // immutable
}
// init fills in a new inode: it records impl, takes ownership and mode from
// creds and mode, starts the reference count at 1, and assigns the next
// inode number from fs. The link count is not touched here.
func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
	// i.nlink initialized by caller
	i.impl = impl
	i.refs = 1
	i.mode = uint32(mode)
	i.uid, i.gid = uint32(creds.EffectiveKUID), uint32(creds.EffectiveKGID)
	// Inode numbers come from a shared counter, bumped atomically.
	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
}
// incLinksLocked adds one link to i. It panics if the resulting link count
// is not greater than one, i.e. if i had no links to begin with.
//
// Preconditions: filesystem.mu must be locked for writing.
func (i *inode) incLinksLocked() {
	if links := atomic.AddUint32(&i.nlink, 1); links > 1 {
		return
	}
	panic("memfs.inode.incLinksLocked() called with no existing links")
}
// decLinksLocked removes one link from i. When the link count reaches zero
// it drops a reference on i; it panics if the count was already zero.
//
// Preconditions: filesystem.mu must be locked for writing.
func (i *inode) decLinksLocked() {
	// Adding the all-ones value is an unsigned decrement by one. Getting the
	// same all-ones value back means the counter wrapped around from zero.
	const minusOne = ^uint32(0)

	switch atomic.AddUint32(&i.nlink, minusOne) {
	case 0:
		i.decRef()
	case minusOne: // negative overflow
		panic("memfs.inode.decLinksLocked() called with no existing links")
	}
}
// incRef takes an additional reference on i. It panics if the count after
// the increment is not greater than one, meaning the caller did not already
// hold a reference.
func (i *inode) incRef() {
	newRefs := atomic.AddInt64(&i.refs, 1)
	if newRefs > 1 {
		return
	}
	panic("memfs.inode.incRef() called without holding a reference")
}
// tryIncRef takes a reference on i unless its reference count is zero. It
// returns true if a reference was taken and false if the count was zero.
func (i *inode) tryIncRef() bool {
	// Compare-and-swap loop: reload and retry whenever another goroutine
	// changes the count between the load and the swap.
	for cur := atomic.LoadInt64(&i.refs); cur != 0; cur = atomic.LoadInt64(&i.refs) {
		if atomic.CompareAndSwapInt64(&i.refs, cur, cur+1) {
			return true
		}
	}
	return false
}
func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
// This is unnecessary; it's mostly to simulate what tmpfs would do.
if regfile, ok := i.impl.(*regularFile); ok {
regfile.mu.Lock()
regfile.data = nil
atomic.StoreInt64(®file.dataLen, 0)
regfile.mu.Unlock()
}
} else if refs < 0 {
panic("memfs.inode.decRef() called without holding a reference")
}
}
// checkPermissions checks whether creds may access i in the ways described
// by ats. It atomically snapshots i's mode, owner and group and delegates
// the decision to vfs.GenericCheckPermissions. isDir is passed through
// unchanged.
func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
	mode := uint16(atomic.LoadUint32(&i.mode))
	owner := auth.KUID(atomic.LoadUint32(&i.uid))
	group := auth.KGID(atomic.LoadUint32(&i.gid))
	return vfs.GenericCheckPermissions(creds, ats, isDir, mode, owner, group)
}
// statTo writes i's metadata into stat. Type, mode, link count, owner, group
// and inode number are reported for every inode; size and block count are
// added for regular files and symlinks. It panics on an unrecognized inode
// type.
//
// Go won't inline this function, and returning linux.Statx (which is quite
// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
// output parameter.
func (i *inode) statTo(stat *linux.Statx) {
	const (
		// commonMask covers the fields filled in for every inode type.
		commonMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
		// sizeMask covers the extra fields filled in for sized inode types.
		sizeMask = linux.STATX_SIZE | linux.STATX_BLOCKS
	)

	stat.Mask = commonMask
	stat.Blksize = 1 // usermem.PageSize in tmpfs
	stat.Nlink = atomic.LoadUint32(&i.nlink)
	stat.UID = atomic.LoadUint32(&i.uid)
	stat.GID = atomic.LoadUint32(&i.gid)
	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
	stat.Ino = i.ino
	// TODO: device number

	switch obj := i.impl.(type) {
	case *regularFile:
		size := uint64(atomic.LoadInt64(&obj.dataLen))
		stat.Mode |= linux.S_IFREG
		stat.Mask |= sizeMask
		stat.Size = size
		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
		// a uint64 accessed using atomic memory operations to avoid taking
		// locks).
		stat.Blocks = allocatedBlocksForSize(size)
	case *directory:
		stat.Mode |= linux.S_IFDIR
	case *symlink:
		size := uint64(len(obj.target))
		stat.Mode |= linux.S_IFLNK
		stat.Mask |= sizeMask
		stat.Size = size
		stat.Blocks = allocatedBlocksForSize(size)
	case *namedPipe:
		stat.Mode |= linux.S_IFIFO
	default:
		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
	}
}
// allocatedBlocksForSize returns the number of 512B blocks needed to
// accommodate the given size in bytes, as appropriate for struct
// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
// size is independent of the "preferred block size for I/O", struct
// stat::st_blksize and struct statx::stx_blksize.)
//
// The result is size/512 rounded up. It is computed from the quotient and
// remainder rather than as (size+511)/512, because that sum wraps around for
// sizes within 511 bytes of the maximum uint64 and would yield 0.
func allocatedBlocksForSize(size uint64) uint64 {
	const blockSize = 512
	blocks := size / blockSize
	if size%blockSize != 0 {
		// A partially filled trailing block still occupies a whole block.
		blocks++
	}
	return blocks
}
// direntType returns the linux.DT_* directory entry type that corresponds to
// i's implementation type. It covers the same inode types that statTo
// handles (regular file, directory, symlink, named pipe) and panics for any
// other type.
func (i *inode) direntType() uint8 {
	switch i.impl.(type) {
	case *regularFile:
		return linux.DT_REG
	case *directory:
		return linux.DT_DIR
	case *symlink:
		return linux.DT_LNK
	case *namedPipe:
		// Named pipes are valid inodes (statTo reports them as S_IFIFO), so
		// they must not fall through to the panic below.
		return linux.DT_FIFO
	default:
		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
	}
}
// fileDescription is embedded by memfs implementations of
// vfs.FileDescriptionImpl.
type fileDescription struct {
	// vfsfd is the vfs.FileDescription through which this file description's
	// mount and dentry are reached (see filesystem() and inode()).
	vfsfd vfs.FileDescription
	// NOTE(review): presumably supplies default implementations of the
	// vfs.FileDescriptionImpl methods not defined here; confirm against the
	// vfs package.
	vfs.FileDescriptionDefaultImpl

	flags uint32 // status flags; immutable
}
// filesystem returns the memfs filesystem that backs the mount fd was opened
// on. It panics if that mount's filesystem implementation is not a
// *filesystem.
func (fd *fileDescription) filesystem() *filesystem {
	mnt := fd.vfsfd.Mount()
	impl := mnt.Filesystem().Impl()
	return impl.(*filesystem)
}
// inode returns the inode represented by the dentry that fd refers to. It
// panics if that dentry's implementation is not a *dentry.
func (fd *fileDescription) inode() *inode {
	d := fd.vfsfd.Dentry().Impl().(*dentry)
	return d.inode
}
// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. It returns the
// status flags stored in fd; the error is always nil.
func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
	return fd.flags, nil
}
// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. The
// requested flags are ignored and fd's stored flags are left unchanged; the
// call always returns nil.
func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
	// no-op.
	return nil
}
// Stat implements vfs.FileDescriptionImpl.Stat. It returns the metadata that
// statTo reports for fd's inode; opts is not consulted and the error is
// always nil.
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	target := fd.inode()

	var result linux.Statx
	target.statTo(&result)
	return result, nil
}
// SetStat implements vfs.FileDescriptionImpl.SetStat. Changing metadata is
// not implemented yet: a request that sets no fields succeeds trivially, and
// any other request fails with EPERM.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
	if opts.Stat.Mask != 0 {
		// TODO: implement inode.setStat
		return syserror.EPERM
	}
	return nil
}
|