1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
|
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fuse
import (
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
// fuseDevMinor is the minor device number of the /dev/fuse device.
const fuseDevMinor = 229

// fuseDevice implements vfs.Device for /dev/fuse.
type fuseDevice struct{}
// Open implements vfs.Device.Open.
//
// It fails with ENOENT while kernel.FUSEEnabled is false. Otherwise it
// allocates a new DeviceFD, initializes its embedded vfs.FileDescription with
// the caller's open flags, and returns that file description.
func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	if !kernel.FUSEEnabled {
		return nil, syserror.ENOENT
	}

	devFD := new(DeviceFD)
	fdOpts := vfs.FileDescriptionOptions{UseDentryMetadata: true}
	err := devFD.vfsfd.Init(devFD, opts.Flags, mnt, vfsd, &fdOpts)
	if err != nil {
		return nil, err
	}
	return &devFD.vfsfd, nil
}
// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
//
// Requests destined for the FUSE server are queued on the FD and handed out
// through Read; the server's replies arrive through Write and are matched to
// the waiting futureResponse by their unique ID.
type DeviceFD struct {
	// vfsfd is the file description initialized by fuseDevice.Open and
	// returned to the opener.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// nextOpID is used to create new requests.
	nextOpID linux.FUSEOpID

	// queue is the list of requests that need to be processed by the FUSE server.
	queue requestList

	// numActiveRequests is the number of requests made by the Sentry that have
	// yet to be responded to.
	numActiveRequests uint64

	// completions is used to map a request to its response. A Writer will use this
	// to notify the caller of a completed response.
	completions map[linux.FUSEOpID]*futureResponse

	// writeCursor is the number of bytes of the reply currently being written
	// by the server that have been copied in so far, header included. It is
	// used as the offset into writeBuf while the header is incomplete and into
	// writeCursorFR.data afterwards, and is reset to 0 once a reply is done.
	writeCursor uint32

	// writeBuf is the memory buffer used to copy in the FUSE out header from
	// userspace.
	writeBuf []byte

	// writeCursorFR is the futureResponse currently being copied in from the
	// server, or nil while no complete reply header has been received.
	writeCursorFR *futureResponse

	// mu protects all the queues, maps, buffers and cursors and nextOpID.
	mu sync.Mutex

	// waitQueue is used to notify interested parties when the device becomes
	// readable or writable.
	waitQueue waiter.Queue

	// fullQueueCh is a channel used to synchronize the readers with the writers.
	// Writers (inbound requests to the filesystem) block if there are too many
	// unprocessed in-flight requests. sendResponse performs a non-blocking send
	// on it each time a response is delivered.
	fullQueueCh chan struct{}

	// fs is the FUSE filesystem that this FD is being used for. The methods of
	// this type treat a nil fs as "no FUSE filesystem mounted yet".
	fs *filesystem
}
// Release implements vfs.FileDescriptionImpl.Release.
//
// If a filesystem is attached to this FD, its connection is marked as no
// longer connected; otherwise there is nothing to do.
func (fd *DeviceFD) Release(context.Context) {
	if fd.fs == nil {
		return
	}
	fd.fs.conn.connected = false
}
// PRead implements vfs.FileDescriptionImpl.PRead.
//
// Positional reads are not supported on /dev/fuse: the call fails with ENOSYS
// once a FUSE filesystem is attached, and with EPERM before that.
func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	if fd.fs != nil {
		return 0, syserror.ENOSYS
	}
	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
	return 0, syserror.EPERM
}
// Read implements vfs.FileDescriptionImpl.Read.
//
// It validates the state of the device and the size of the destination buffer,
// then hands the next queued request to the caller via readLocked while
// holding fd.mu. It fails with EPERM if no filesystem is attached, ENODEV if
// the filesystem is unmounted, and EINVAL if dst is too small.
func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	switch {
	case fd.fs == nil:
		// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
		return 0, syserror.EPERM
	case fd.fs.umounted:
		// TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs.
		return 0, syserror.ENODEV
	}

	// Every read must supply a buffer of a sane minimum size: room for the
	// fixed parts of any request header (Linux uses the request header plus
	// the FUSEWriteIn header for this calculation) and for the negotiated
	// MaxWrite bytes of data, but never less than FUSE_MIN_READ_BUFFER.
	fixedHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + uint32((*linux.FUSEWriteIn)(nil).SizeBytes())
	required := linux.FUSE_MIN_READ_BUFFER
	if negotiated := fixedHdrLen + fd.fs.conn.maxWrite; required < negotiated {
		required = negotiated
	}

	// Reject read buffers that are too small.
	if dst.NumBytes() < int64(required) {
		return 0, syserror.EINVAL
	}

	fd.mu.Lock()
	defer fd.mu.Unlock()
	return fd.readLocked(ctx, dst, opts)
}
// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
//
// It copies the first queued request that fits into dst out to the caller and
// returns the number of bytes written. Requests at the head of the queue that
// do not fit are failed back to their originator and dropped. If no request is
// left, ErrWouldBlock is returned.
//
// Preconditions: dst is large enough for any reasonable request.
func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	// Pick the first request that fits. In the normal case the very first
	// candidate is accepted.
	var req *Request
	for req == nil {
		if fd.queue.Empty() {
			return 0, syserror.ErrWouldBlock
		}

		head := fd.queue.Front()
		if int64(head.hdr.Len)+int64(len(head.payload)) <= dst.NumBytes() {
			req = head
			break
		}

		// The request is too large and cannot be processed. All requests must
		// be smaller than the negotiated size as specified by
		// Connection.MaxWrite set as part of the FUSE_INIT handshake.
		errno := -int32(syscall.EIO)
		if head.hdr.Opcode == linux.FUSE_SETXATTR {
			errno = -int32(syscall.E2BIG)
		}

		// Report the failure to the task that issued the request.
		if err := fd.sendError(ctx, errno, head.hdr.Unique); err != nil {
			return 0, err
		}

		// This request is finished; drop it and look at the next one.
		fd.queue.Remove(head)
	}

	// The size was checked above, so dst can hold the whole request. Copy out
	// the marshalled request first, then any additional payload.
	copied, err := dst.CopyOut(ctx, req.data)
	switch {
	case err != nil:
		return 0, err
	case copied != len(req.data):
		return 0, syserror.EIO
	}

	if req.hdr.Opcode == linux.FUSE_WRITE {
		extra, err := dst.DropFirst(copied).CopyOut(ctx, req.payload)
		switch {
		case err != nil:
			return 0, err
		case extra != len(req.payload):
			return 0, syserror.EIO
		}
		copied += int(extra)
	}

	// The request has been fully delivered; take it off the queue.
	fd.queue.Remove(req)

	// Requests that expect no reply are no longer tracked once delivered.
	if req.noReply {
		fd.numActiveRequests--
		delete(fd.completions, req.hdr.Unique)
	}

	return int64(copied), nil
}
// PWrite implements vfs.FileDescriptionImpl.PWrite.
//
// Positional writes are not supported on /dev/fuse: the call fails with ENOSYS
// once a FUSE filesystem is attached, and with EPERM before that.
func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	if fd.fs != nil {
		return 0, syserror.ENOSYS
	}
	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
	return 0, syserror.EPERM
}
// Write implements vfs.FileDescriptionImpl.Write.
//
// It takes fd.mu for the duration of the call and delegates to writeLocked.
func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	fd.mu.Lock()
	defer fd.mu.Unlock()

	written, err := fd.writeLocked(ctx, src, opts)
	return written, err
}
// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
//
// A reply from the FUSE server is consumed in two phases: first the fixed-size
// linux.FUSEHeaderOut is accumulated in fd.writeBuf, then the remainder of the
// reply (as announced by the header's Len field) is copied into the matching
// futureResponse. fd.writeCursor counts the bytes of the current reply that
// have been consumed so far, header included.
//
// It returns the number of bytes consumed from src. It fails with:
//   - EPERM if no FUSE filesystem is attached to this FD,
//   - ENODEV if the filesystem has been unmounted,
//   - EINVAL if the reply names a request that is not awaiting completion,
//   - any error from copying in user memory or from sendResponse.
//
// Preconditions: fd.mu must be held.
func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
	if fd.fs == nil {
		return 0, syserror.EPERM
	}

	// Return ENODEV if the filesystem is umounted.
	if fd.fs.umounted {
		return 0, syserror.ENODEV
	}

	var n int64
	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())

	for src.NumBytes() > 0 {
		if fd.writeCursorFR != nil {
			// Already have the common header, and we're now copying the payload.
			//
			// NOTE(review): hdr.Len comes from the server unvalidated; a Len
			// smaller than hdrLen would make the slice expression below panic.
			// Consider rejecting such headers.
			wantBytes := fd.writeCursorFR.hdr.Len

			// Note that the FR data doesn't have the header. Allocate the
			// buffer on first use.
			if fd.writeCursorFR.data == nil {
				fd.writeCursorFR.data = make([]byte, wantBytes)
			}

			bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes])
			if err != nil {
				return 0, err
			}
			src = src.DropFirst(bytesCopied)

			n += int64(bytesCopied)
			fd.writeCursor += uint32(bytesCopied)
			if fd.writeCursor == wantBytes {
				// Done reading this full response. Clean up and unblock the
				// initiator.
				break
			}

			// Check if we have more data in src.
			continue
		}

		// Assert that the header isn't read into the writeBuf yet.
		if fd.writeCursor >= hdrLen {
			return 0, syserror.EINVAL
		}

		// We don't have the full common response header yet. Fill the part of
		// the header buffer that is still missing, i.e. everything from the
		// cursor up to hdrLen. (Using hdrLen-writeCursor as the upper bound
		// would only be correct while the cursor is still 0.)
		bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:hdrLen])
		if err != nil {
			return 0, err
		}
		src = src.DropFirst(bytesCopied)

		n += int64(bytesCopied)
		fd.writeCursor += uint32(bytesCopied)
		if fd.writeCursor == hdrLen {
			// Have full header in the writeBuf. Use it to fetch the actual futureResponse
			// from the device's completions map.
			var hdr linux.FUSEHeaderOut
			hdr.UnmarshalBytes(fd.writeBuf)

			// We have the header now and so the writeBuf has served its purpose.
			// We could reset it manually here but instead of doing that, at the
			// end of the write, the writeCursor will be set to 0 thereby allowing
			// the next request to overwrite what's in the buffer.

			fut, ok := fd.completions[hdr.Unique]
			if !ok {
				// Server sent us a response for a request we never sent,
				// or for which we already received a reply (e.g. aborted), an
				// unlikely event. fut is nil here and must not be dereferenced.
				//
				// Discard the bogus header so that the device can accept the
				// next reply instead of failing every subsequent write.
				fd.writeCursor = 0
				return 0, syserror.EINVAL
			}

			delete(fd.completions, hdr.Unique)

			// Copy over the header into the future response. The rest of the payload
			// will be copied over to the FR's data in the next iteration.
			fut.hdr = &hdr
			fd.writeCursorFR = fut

			// Next iteration will now try read the complete request, if src has
			// any data remaining. Otherwise we're done.
		}
	}

	if fut := fd.writeCursorFR; fut != nil {
		// Ready the device for the next reply before delivering this one.
		// sendResponse closes the response channel even when it returns an
		// error, so the same futureResponse must never be delivered twice.
		fd.writeCursorFR = nil
		fd.writeCursor = 0

		if err := fd.sendResponse(ctx, fut); err != nil {
			return 0, err
		}
	}

	return n, nil
}
// Readiness implements vfs.FileDescriptionImpl.Readiness.
//
// It takes fd.mu for the duration of the call and delegates to
// readinessLocked.
func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
	fd.mu.Lock()
	defer fd.mu.Unlock()

	events := fd.readinessLocked(mask)
	return events
}
// readinessLocked implements checking the readiness of the fuse device while
// locked with DeviceFD.mu.
//
// It returns the subset of mask that is currently ready:
//   - only EventErr if no filesystem is attached to the FD or the filesystem
//     has been unmounted,
//   - otherwise EventOut (the device is always writable), plus EventIn if at
//     least one request is queued for the server.
//
// Preconditions: fd.mu must be held.
func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
	var ready waiter.EventMask

	// fd.fs stays nil until a FUSE filesystem is mounted with this FD, so it
	// must be checked before it is dereferenced; polling a freshly opened
	// /dev/fuse would otherwise panic.
	if fd.fs == nil || fd.fs.umounted {
		ready |= waiter.EventErr
		return ready & mask
	}

	// FD is always writable.
	ready |= waiter.EventOut
	if !fd.queue.Empty() {
		// Have reqs available, FD is readable.
		ready |= waiter.EventIn
	}

	return ready & mask
}
// EventRegister implements waiter.Waitable.EventRegister.
//
// The entry is registered on the device's wait queue for the events in mask.
func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	q := &fd.waitQueue
	q.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// The entry is removed from the device's wait queue.
func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
	q := &fd.waitQueue
	q.EventUnregister(e)
}
// Seek implements vfs.FileDescriptionImpl.Seek.
//
// Seeking is not supported on /dev/fuse: the call fails with ENOSYS once a
// FUSE filesystem is attached, and with EPERM before that.
func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	if fd.fs != nil {
		return 0, syserror.ENOSYS
	}
	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
	return 0, syserror.EPERM
}
// sendResponse sends a response to the waiting task (if any).
//
// The response channel of fut is closed when this function returns, whether or
// not an error is reported. For async requests the registered callback is run
// on the response and its error is returned.
//
// Preconditions: fd.mu must be held.
func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
	// Closing the channel on return wakes the task waiting on this response,
	// if there is one.
	defer close(fut.ch)

	// Announce that the queue is no longer full. The send is non-blocking, so
	// nothing happens when nobody can take the notification.
	select {
	case fd.fullQueueCh <- struct{}{}:
	default:
	}

	fd.numActiveRequests -= 1

	if !fut.async {
		return nil
	}
	resp := fut.getResponse()
	return fd.asyncCallBack(ctx, resp)
}
// sendError sends an error response to the waiting task (if any) by calling sendResponse().
//
// The response consists of a bare FUSEHeaderOut carrying errno and unique. It
// returns EINVAL if no request with the given unique ID is awaiting
// completion.
//
// Preconditions: fd.mu must be held.
func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
	fut, pending := fd.completions[unique]
	if !pending {
		// A response for a request we never sent,
		// or for which we already received a reply (e.g. aborted).
		return syserror.EINVAL
	}
	delete(fd.completions, unique)

	// Hand the error back to the calling task as a header-only reply.
	fut.hdr = &linux.FUSEHeaderOut{
		Len:    uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()),
		Error:  errno,
		Unique: unique,
	}
	return fd.sendResponse(ctx, fut)
}
// asyncCallBack executes pre-defined callback function for async requests.
// Currently used by: FUSE_INIT.
//
// For a FUSE_INIT response the connection's InitRecv is invoked, telling it
// whether the caller's credentials hold CAP_SYS_ADMIN in the root user
// namespace. Responses with any other opcode are ignored.
func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
	if r.opcode != linux.FUSE_INIT {
		// TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
		return nil
	}

	creds := auth.CredentialsFromContext(ctx)
	rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
	hasSysAdmin := creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)
	return fd.fs.conn.InitRecv(r, hasSysAdmin)
}
|