diff options
author | gVisor bot <gvisor-bot@google.com> | 2020-09-11 23:13:20 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2020-09-11 23:13:20 +0000 |
commit | 2c7d03d2378865ee92d7ae24e48748f17233e783 (patch) | |
tree | f84e553a41525dd9dc4dc64f6b6fb1bbdf1481a8 /pkg/sentry | |
parent | 51194daa8da2bee362c9ecb6491016038e5f8436 (diff) | |
parent | 1f4fb817c8ff8be7239a99baff01d8f20b2e9abd (diff) |
Merge release-20200907.0-36-g1f4fb817c (automated)
Diffstat (limited to 'pkg/sentry')
55 files changed, 853 insertions, 2140 deletions
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index 5d4f312cf..c8231e0aa 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, err + return length, n, msg.Controllen, controlTrunc, nil } - return n, n, msg.Controllen, controlTrunc, err + return n, n, msg.Controllen, controlTrunc, nil } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/fsimpl/devpts/root_inode_refs.go b/pkg/sentry/fsimpl/devpts/root_inode_refs.go index 051801202..4abb66431 100644 --- a/pkg/sentry/fsimpl/devpts/root_inode_refs.go +++ b/pkg/sentry/fsimpl/devpts/root_inode_refs.go @@ -2,11 +2,10 @@ package devpts import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index a7402c149..6df2728ab 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -15,17 +15,31 @@ package fuse import ( + "errors" + "fmt" "sync" + "sync/atomic" + "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" ) +// maxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const maxActiveRequestsDefault = 10000 + +// Ordinary requests have even IDs, while interrupts IDs are odd. +// Used to increment the unique ID for each FUSE request. +var reqIDStep uint64 = 2 + const ( // fuseDefaultMaxBackground is the default value for MaxBackground. fuseDefaultMaxBackground = 12 @@ -38,36 +52,43 @@ const ( fuseDefaultMaxPagesPerReq = 32 ) +// Request represents a FUSE operation request that hasn't been sent to the +// server yet. +// +// +stateify savable +type Request struct { + requestEntry + + id linux.FUSEOpID + hdr *linux.FUSEHeaderIn + data []byte +} + +// Response represents an actual response from the server, including the +// response payload. +// +// +stateify savable +type Response struct { + opcode linux.FUSEOpcode + hdr linux.FUSEHeaderOut + data []byte +} + // connection is the struct by which the sentry communicates with the FUSE server daemon. -// Lock order: -// - conn.fd.mu -// - conn.mu -// - conn.asyncMu type connection struct { fd *DeviceFD - // mu protects access to struct memebers. - mu sync.Mutex - - // attributeVersion is the version of connection's attributes. - attributeVersion uint64 - - // We target FUSE 7.23. // The following FUSE_INIT flags are currently unsupported by this implementation: + // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC) // - FUSE_EXPORT_SUPPORT + // - FUSE_HANDLE_KILLPRIV // - FUSE_POSIX_LOCKS: requires POSIX locks // - FUSE_FLOCK_LOCKS: requires POSIX locks // - FUSE_AUTO_INVAL_DATA: requires page caching eviction + // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation // - FUSE_ASYNC_DIO - // - FUSE_PARALLEL_DIROPS (7.25) - // - FUSE_HANDLE_KILLPRIV (7.26) - // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26) - // - FUSE_ABORT_ERROR (7.27) - // - FUSE_CACHE_SYMLINKS (7.28) - // - FUSE_NO_OPENDIR_SUPPORT (7.29) - // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30) - // - FUSE_MAP_ALIGNMENT (7.31) + // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler // initialized after receiving FUSE_INIT reply. // Until it's set, suspend sending FUSE requests. @@ -77,6 +98,10 @@ type connection struct { // initializedChan is used to block requests before initialization. initializedChan chan struct{} + // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground). + // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block. + blocked bool + // connected (connection established) when a new FUSE file system is created. // Set to false when: // umount, @@ -84,55 +109,48 @@ type connection struct { // device release. connected bool + // aborted via sysfs. + // TODO(gvisor.dev/issue/3185): abort all queued requests. + aborted bool + // connInitError if FUSE_INIT encountered error (major version mismatch). // Only set in INIT. connInitError bool // connInitSuccess if FUSE_INIT is successful. // Only set in INIT. - // Used for destory (not yet implemented). + // Used for destory. connInitSuccess bool - // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV). - // Set only if abortErr is true and via fuse control fs (not yet implemented). - // TODO(gvisor.dev/issue/3525): set this to true when user aborts. - aborted bool + // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress. - // numWating is the number of requests waiting to be - // sent to FUSE device or being processed by FUSE daemon. - numWaiting uint32 + // NumberBackground is the number of requests in the background. + numBackground uint16 - // Terminology note: - // - // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct. - // - // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct. - // - // We call the "background" requests in unix term as async requests. - // The "async requests" in unix term is our async requests that expect a reply, - // i.e. `!requestOptions.noReply` + // congestionThreshold for NumBackground. + // Negotiated in FUSE_INIT. + congestionThreshold uint16 + + // maxBackground is the maximum number of NumBackground. + // Block connection when it is reached. + // Negotiated in FUSE_INIT. + maxBackground uint16 - // asyncMu protects the async request fields. - asyncMu sync.Mutex + // numActiveBackground is the number of requests in background and has being marked as active. + numActiveBackground uint16 - // asyncNum is the number of async requests. - // Protected by asyncMu. - asyncNum uint16 + // numWating is the number of requests waiting for completion. + numWaiting uint32 - // asyncCongestionThreshold the number of async requests. - // Negotiated in FUSE_INIT as "CongestionThreshold". - // TODO(gvisor.dev/issue/3529): add congestion control. - // Protected by asyncMu. - asyncCongestionThreshold uint16 + // TODO(gvisor.dev/issue/3185): BgQueue + // some queue for background queued requests. - // asyncNumMax is the maximum number of asyncNum. - // Connection blocks the async requests when it is reached. - // Negotiated in FUSE_INIT as "MaxBackground". - // Protected by asyncMu. - asyncNumMax uint16 + // bgLock protects: + // MaxBackground, CongestionThreshold, NumBackground, + // NumActiveBackground, BgQueue, Blocked. + bgLock sync.Mutex // maxRead is the maximum size of a read buffer in in bytes. - // Initialized from a fuse fs parameter. maxRead uint32 // maxWrite is the maximum size of a write buffer in bytes. @@ -147,20 +165,23 @@ type connection struct { // Negotiated and only set in INIT. minor uint32 - // atomicOTrunc is true when FUSE does not send a separate SETATTR request - // before open with O_TRUNC flag. - // Negotiated and only set in INIT. - atomicOTrunc bool - // asyncRead if read pages asynchronously. // Negotiated and only set in INIT. asyncRead bool + // abortErr is true if kernel need to return an unique read error after abort. + // Negotiated and only set in INIT. + abortErr bool + // writebackCache is true for write-back cache policy, // false for write-through policy. // Negotiated and only set in INIT. writebackCache bool + // cacheSymlinks if filesystem needs to cache READLINK responses in page cache. + // Negotiated and only set in INIT. + cacheSymlinks bool + // bigWrites if doing multi-page cached writes. // Negotiated and only set in INIT. bigWrites bool @@ -168,70 +189,116 @@ type connection struct { // dontMask if filestestem does not apply umask to creation modes. // Negotiated in INIT. dontMask bool - - // noOpen if FUSE server doesn't support open operation. - // This flag only influence performance, not correctness of the program. - noOpen bool } // newFUSEConnection creates a FUSE connection to fd. -func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) { +func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) { // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to // mount a FUSE filesystem. fuseFD := fd.Impl().(*DeviceFD) + fuseFD.mounted = true // Create the writeBuf for the header to be stored in. hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) fuseFD.writeBuf = make([]byte, hdrLen) fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) - fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests) + fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) fuseFD.writeCursor = 0 return &connection{ - fd: fuseFD, - asyncNumMax: fuseDefaultMaxBackground, - asyncCongestionThreshold: fuseDefaultCongestionThreshold, - maxRead: opts.maxRead, - maxPages: fuseDefaultMaxPagesPerReq, - initializedChan: make(chan struct{}), - connected: true, + fd: fuseFD, + maxBackground: fuseDefaultMaxBackground, + congestionThreshold: fuseDefaultCongestionThreshold, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, }, nil } -// CallAsync makes an async (aka background) request. -// It's a simple wrapper around Call(). -func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { - r.async = true - _, err := conn.Call(t, r) - return err +// SetInitialized atomically sets the connection as initialized. +func (conn *connection) SetInitialized() { + // Unblock the requests sent before INIT. + close(conn.initializedChan) + + // Close the channel first to avoid the non-atomic situation + // where conn.initialized is true but there are + // tasks being blocked on the channel. + // And it prevents the newer tasks from gaining + // unnecessary higher chance to be issued before the blocked one. + + atomic.StoreInt32(&(conn.initialized), int32(1)) } -// Call makes a request to the server. -// Block before the connection is initialized. -// When the Request is FUSE_INIT, it will not be blocked before initialization. -// Task should never be nil. -// -// For a sync request, it blocks the invoking task until -// a server responds with a response. -// -// For an async request (that do not expect a response immediately), -// it returns directly unless being blocked either before initialization -// or when there are too many async requests ongoing. -// -// Example for async request: -// init, readahead, write, async read/write, fuse_notify_reply, -// non-sync release, interrupt, forget. -// -// The forget request does not have a reply, -// as documented in include/uapi/linux/fuse.h:FUSE_FORGET. +// IsInitialized atomically check if the connection is initialized. +// pairs with SetInitialized(). +func (conn *connection) Initialized() bool { + return atomic.LoadInt32(&(conn.initialized)) != 0 +} + +// NewRequest creates a new request that can be sent to the FUSE server. +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) + + hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() + hdr := linux.FUSEHeaderIn{ + Len: uint32(hdrLen + payload.SizeBytes()), + Opcode: opcode, + Unique: conn.fd.nextOpID, + NodeID: ino, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + PID: pid, + } + + buf := make([]byte, hdr.Len) + hdr.MarshalUnsafe(buf[:hdrLen]) + payload.MarshalUnsafe(buf[hdrLen:]) + + return &Request{ + id: hdr.Unique, + hdr: &hdr, + data: buf, + }, nil +} + +// Call makes a request to the server and blocks the invoking task until a +// server responds with a response. Task should never be nil. +// Requests will not be sent before the connection is initialized. +// For async tasks, use CallAsync(). func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { // Block requests sent before connection is initalized. - if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { + if !conn.Initialized() { if err := t.Block(conn.initializedChan); err != nil { return nil, err } } + return conn.call(t, r) +} + +// CallAsync makes an async (aka background) request. +// Those requests either do not expect a response (e.g. release) or +// the response should be handled by others (e.g. init). +// Return immediately unless the connection is blocked (before initialization). +// Async call example: init, release, forget, aio, interrupt. +// When the Request is FUSE_INIT, it will not be blocked before initialization. +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + // Block requests sent before connection is initalized. + if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { + if err := t.Block(conn.initializedChan); err != nil { + return err + } + } + + // This should be the only place that invokes call() with a nil task. + _, err := conn.call(nil, r) + return err +} + +// call makes a call without blocking checks. +func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { if !conn.connected { return nil, syserror.ENOTCONN } @@ -248,6 +315,31 @@ func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { return fut.resolve(t) } +// Error returns the error of the FUSE call. +func (r *Response) Error() error { + errno := r.hdr.Error + if errno >= 0 { + return nil + } + + sysErrNo := syscall.Errno(-errno) + return error(sysErrNo) +} + +// UnmarshalPayload unmarshals the response data into m. +func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { + hdrLen := r.hdr.SizeBytes() + haveDataLen := r.hdr.Len - uint32(hdrLen) + wantDataLen := uint32(m.SizeBytes()) + + if haveDataLen < wantDataLen { + return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) + } + + m.UnmarshalUnsafe(r.data[hdrLen:]) + return nil +} + // callFuture makes a request to the server and returns a future response. // Call resolve() when the response needs to be fulfilled. func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { @@ -266,6 +358,11 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, // if there are always too many ongoing requests all the time. The // supported maxActiveRequests setting should be really high to avoid this. for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + if t == nil { + // Since there is no task that is waiting. We must error out. + return nil, errors.New("FUSE request queue full") + } + log.Infof("Blocking request %v from being queued. Too many active requests: %v", r.id, conn.fd.numActiveRequests) conn.fd.mu.Unlock() @@ -281,19 +378,9 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, // callFutureLocked makes a request to the server and returns a future response. func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { - // Check connected again holding conn.mu. - conn.mu.Lock() - if !conn.connected { - conn.mu.Unlock() - // we checked connected before, - // this must be due to aborted connection. - return nil, syserror.ECONNABORTED - } - conn.mu.Unlock() - conn.fd.queue.PushBack(r) - conn.fd.numActiveRequests++ - fut := newFutureResponse(r) + conn.fd.numActiveRequests += 1 + fut := newFutureResponse(r.hdr.Opcode) conn.fd.completions[r.id] = fut // Signal the readers that there is something to read. @@ -301,3 +388,50 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes return fut, nil } + +// futureResponse represents an in-flight request, that may or may not have +// completed yet. Convert it to a resolved Response by calling Resolve, but note +// that this may block. +// +// +stateify savable +type futureResponse struct { + opcode linux.FUSEOpcode + ch chan struct{} + hdr *linux.FUSEHeaderOut + data []byte +} + +// newFutureResponse creates a future response to a FUSE request. +func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse { + return &futureResponse{ + opcode: opcode, + ch: make(chan struct{}), + } +} + +// resolve blocks the task until the server responds to its corresponding request, +// then returns a resolved response. +func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { + // If there is no Task associated with this request - then we don't try to resolve + // the response. Instead, the task writing the response (proxy to the server) will + // process the response on our behalf. + if t == nil { + log.Infof("fuse.Response.resolve: Not waiting on a response from server.") + return nil, nil + } + + if err := t.Block(f.ch); err != nil { + return nil, err + } + + return f.getResponse(), nil +} + +// getResponse creates a Response from the data the futureResponse has. +func (f *futureResponse) getResponse() *Response { + return &Response{ + opcode: f.opcode, + hdr: *f.hdr, + data: f.data, + } +} diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 64c3e32e2..e522ff9a0 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -55,6 +56,9 @@ type DeviceFD struct { vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD + // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. + mounted bool + // nextOpID is used to create new requests. nextOpID linux.FUSEOpID @@ -96,15 +100,13 @@ type DeviceFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *DeviceFD) Release(context.Context) { - if fd.fs != nil { - fd.fs.conn.connected = false - } + fd.fs.conn.connected = false } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if fd.fs == nil { + if !fd.mounted { return 0, syserror.EPERM } @@ -114,16 +116,10 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if fd.fs == nil { + if !fd.mounted { return 0, syserror.EPERM } - // Return ENODEV if the filesystem is umounted. - if fd.fs.umounted { - // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs. - return 0, syserror.ENODEV - } - // We require that any Read done on this filesystem have a sane minimum // read buffer. It must have the capacity for the fixed parts of any request // header (Linux uses the request header and the FUSEWriteIn header for this @@ -147,82 +143,58 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R } // readLocked implements the reading of the fuse device while locked with DeviceFD.mu. -// -// Preconditions: dst is large enough for any reasonable request. func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - var req *Request + if fd.queue.Empty() { + return 0, syserror.ErrWouldBlock + } - // Find the first valid request. - // For the normal case this loop only execute once. - for !fd.queue.Empty() { - req = fd.queue.Front() + var readCursor uint32 + var bytesRead int64 + for { + req := fd.queue.Front() + if dst.NumBytes() < int64(req.hdr.Len) { + // The request is too large. Cannot process it. All requests must be smaller than the + // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT + // handshake. + errno := -int32(syscall.EIO) + if req.hdr.Opcode == linux.FUSE_SETXATTR { + errno = -int32(syscall.E2BIG) + } - if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() { - break - } + // Return the error to the calling task. + if err := fd.sendError(ctx, errno, req); err != nil { + return 0, err + } - // The request is too large. Cannot process it. All requests must be smaller than the - // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT - // handshake. - errno := -int32(syscall.EIO) - if req.hdr.Opcode == linux.FUSE_SETXATTR { - errno = -int32(syscall.E2BIG) - } + // We're done with this request. + fd.queue.Remove(req) - // Return the error to the calling task. - if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil { - return 0, err + // Restart the read as this request was invalid. + log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.") + return fd.readLocked(ctx, dst, opts) } - // We're done with this request. - fd.queue.Remove(req) - req = nil - } - - if req == nil { - return 0, syserror.ErrWouldBlock - } - - // We already checked the size: dst must be able to fit the whole request. - // Now we write the marshalled header, the payload, - // and the potential additional payload - // to the user memory IOSequence. - - n, err := dst.CopyOut(ctx, req.data) - if err != nil { - return 0, err - } - if n != len(req.data) { - return 0, syserror.EIO - } - - if req.hdr.Opcode == linux.FUSE_WRITE { - written, err := dst.DropFirst(n).CopyOut(ctx, req.payload) + n, err := dst.CopyOut(ctx, req.data[readCursor:]) if err != nil { return 0, err } - if written != len(req.payload) { - return 0, syserror.EIO - } - n += int(written) - } + readCursor += uint32(n) + bytesRead += int64(n) - // Fully done with this req, remove it from the queue. - fd.queue.Remove(req) - - // Remove noReply ones from map of requests expecting a reply. - if req.noReply { - fd.numActiveRequests -= 1 - delete(fd.completions, req.hdr.Unique) + if readCursor >= req.hdr.Len { + // Fully done with this req, remove it from the queue. + fd.queue.Remove(req) + break + } } - return int64(n), nil + return bytesRead, nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if fd.fs == nil { + if !fd.mounted { return 0, syserror.EPERM } @@ -239,15 +211,10 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs. // writeLocked implements writing to the fuse device while locked with DeviceFD.mu. func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if fd.fs == nil { + if !fd.mounted { return 0, syserror.EPERM } - // Return ENODEV if the filesystem is umounted. - if fd.fs.umounted { - return 0, syserror.ENODEV - } - var cn, n int64 hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) @@ -309,12 +276,7 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt fut, ok := fd.completions[hdr.Unique] if !ok { - if fut.hdr.Unique == linux.FUSE_RELEASE { - // Currently we simply discard the reply for FUSE_RELEASE. - return n + src.NumBytes(), nil - } - // Server sent us a response for a request we never sent, - // or for which we already received a reply (e.g. aborted), an unlikely event. + // Server sent us a response for a request we never sent? return 0, syserror.EINVAL } @@ -345,23 +307,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt // Readiness implements vfs.FileDescriptionImpl.Readiness. func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { - fd.mu.Lock() - defer fd.mu.Unlock() - return fd.readinessLocked(mask) -} - -// readinessLocked implements checking the readiness of the fuse device while -// locked with DeviceFD.mu. -func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask - - if fd.fs.umounted { - ready |= waiter.EventErr - return ready & mask - } - - // FD is always writable. - ready |= waiter.EventOut + ready |= waiter.EventOut // FD is always writable if !fd.queue.Empty() { // Have reqs available, FD is readable. ready |= waiter.EventIn @@ -383,7 +330,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if fd.fs == nil { + if !fd.mounted { return 0, syserror.EPERM } @@ -392,54 +339,58 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 // sendResponse sends a response to the waiting task (if any). func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { - // Signal the task waiting on a response if any. - defer close(fut.ch) + // See if the running task need to perform some action before returning. + // Since we just finished writing the future, we can be sure that + // getResponse generates a populated response. + if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil { + return err + } // Signal that the queue is no longer full. select { case fd.fullQueueCh <- struct{}{}: default: } - fd.numActiveRequests-- - - if fut.async { - return fd.asyncCallBack(ctx, fut.getResponse()) - } + fd.numActiveRequests -= 1 + // Signal the task waiting on a response. + close(fut.ch) return nil } -// sendError sends an error response to the waiting task (if any) by calling sendResponse(). -func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { +// sendError sends an error response to the waiting task (if any). +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error { // Return the error to the calling task. outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) respHdr := linux.FUSEHeaderOut{ Len: outHdrLen, Error: errno, - Unique: unique, + Unique: req.hdr.Unique, } fut, ok := fd.completions[respHdr.Unique] if !ok { - // A response for a request we never sent, - // or for which we already received a reply (e.g. aborted). + // Server sent us a response for a request we never sent? return syserror.EINVAL } delete(fd.completions, respHdr.Unique) fut.hdr = &respHdr - return fd.sendResponse(ctx, fut) + if err := fd.sendResponse(ctx, fut); err != nil { + return err + } + + return nil } -// asyncCallBack executes pre-defined callback function for async requests. -// Currently used by: FUSE_INIT. -func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { - switch r.opcode { - case linux.FUSE_INIT: +// noReceiverAction has the calling kernel.Task do some action if its known that no +// receiver is going to be waiting on the future channel. This is to be used by: +// FUSE_INIT. +func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { + if r.opcode == linux.FUSE_INIT { creds := auth.CredentialsFromContext(ctx) rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) - // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. } return nil diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go deleted file mode 100644 index 798c4a6f3..000000000 --- a/pkg/sentry/fsimpl/fuse/directory.go +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fuse - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -type directoryFD struct { - fileDescription -} - -// Allocate implements directoryFD.Allocate. -func (directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EISDIR -} - -// PRead implements FileDescriptionImpl.PRead. -func (directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// Read implements FileDescriptionImpl.Read. -func (directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// PWrite implements FileDescriptionImpl.PWrite. -func (directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// Write implements FileDescriptionImpl.Write. -func (directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// IterDirents implements FileDescriptionImpl.IterDirents. -func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error { - fusefs := dir.inode().fs - task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) - - in := linux.FUSEReadIn{ - Fh: dir.Fh, - Offset: uint64(atomic.LoadInt64(&dir.off)), - Size: linux.FUSE_PAGE_SIZE, - Flags: dir.statusFlags(), - } - - // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS. - req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in) - if err != nil { - return err - } - - res, err := fusefs.conn.Call(task, req) - if err != nil { - return err - } - if err := res.Error(); err != nil { - return err - } - - var out linux.FUSEDirents - if err := res.UnmarshalPayload(&out); err != nil { - return err - } - - for _, fuseDirent := range out.Dirents { - nextOff := int64(fuseDirent.Meta.Off) - dirent := vfs.Dirent{ - Name: fuseDirent.Name, - Type: uint8(fuseDirent.Meta.Type), - Ino: fuseDirent.Meta.Ino, - NextOff: nextOff, - } - - if err := callback.Handle(dirent); err != nil { - return err - } - atomic.StoreInt64(&dir.off, nextOff) - } - - return nil -} diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go deleted file mode 100644 index 991efcda4..000000000 --- a/pkg/sentry/fsimpl/fuse/file.go +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fuse - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/usermem" -) - -// fileDescription implements vfs.FileDescriptionImpl for fuse. -type fileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl - vfs.DentryMetadataFileDescriptionImpl - vfs.NoLockFD - - // the file handle used in userspace. - Fh uint64 - - // Nonseekable is indicate cannot perform seek on a file. - Nonseekable bool - - // DirectIO suggest fuse to use direct io operation. - DirectIO bool - - // OpenFlag is the flag returned by open. - OpenFlag uint32 - - // off is the file offset. - off int64 -} - -func (fd *fileDescription) dentry() *kernfs.Dentry { - return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry) -} - -func (fd *fileDescription) inode() *inode { - return fd.dentry().Inode().(*inode) -} - -func (fd *fileDescription) filesystem() *vfs.Filesystem { - return fd.vfsfd.VirtualDentry().Mount().Filesystem() -} - -func (fd *fileDescription) statusFlags() uint32 { - return fd.vfsfd.StatusFlags() -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *fileDescription) Release(ctx context.Context) { - // no need to release if FUSE server doesn't implement Open. - conn := fd.inode().fs.conn - if conn.noOpen { - return - } - - in := linux.FUSEReleaseIn{ - Fh: fd.Fh, - Flags: fd.statusFlags(), - } - // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner. - var opcode linux.FUSEOpcode - if fd.inode().Mode().IsDir() { - opcode = linux.FUSE_RELEASEDIR - } else { - opcode = linux.FUSE_RELEASE - } - kernelTask := kernel.TaskFromContext(ctx) - // ignoring errors and FUSE server reply is analogous to Linux's behavior. - req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in) - if err != nil { - // No way to invoke Call() with an errored request. - return - } - req.noReply = true - conn.CallAsync(kernelTask, req) -} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, nil -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, nil -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, nil -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, nil -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, nil -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - fs := fd.filesystem() - inode := fd.inode() - return inode.Stat(ctx, fs, opts) -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - fs := fd.filesystem() - creds := auth.CredentialsFromContext(ctx) - return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh) -} diff --git a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go index 6c82dce30..f72fe342e 100644 --- a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go +++ b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go @@ -6,191 +6,179 @@ import ( "gvisor.dev/gvisor/pkg/state" ) -func (x *inodeRefs) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.inodeRefs" +func (x *Request) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.Request" } -func (x *inodeRefs) StateFields() []string { +func (x *Request) StateFields() []string { return []string{ - "refCount", + "requestEntry", + "id", + "hdr", + "data", } } -func (x *inodeRefs) beforeSave() {} +func (x *Request) beforeSave() {} -func (x *inodeRefs) StateSave(m state.Sink) { +func (x *Request) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.refCount) + m.Save(0, &x.requestEntry) + m.Save(1, &x.id) + m.Save(2, &x.hdr) + m.Save(3, &x.data) } -func (x *inodeRefs) afterLoad() {} +func (x *Request) afterLoad() {} -func (x *inodeRefs) StateLoad(m state.Source) { - m.Load(0, &x.refCount) +func (x *Request) StateLoad(m state.Source) { + m.Load(0, &x.requestEntry) + m.Load(1, &x.id) + m.Load(2, &x.hdr) + m.Load(3, &x.data) } -func (x *requestList) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.requestList" +func (x *Response) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.Response" } -func (x *requestList) StateFields() []string { +func (x *Response) StateFields() []string { return []string{ - "head", - "tail", + "opcode", + "hdr", + "data", } } -func (x *requestList) beforeSave() {} +func (x *Response) beforeSave() {} -func (x *requestList) StateSave(m state.Sink) { +func (x *Response) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.head) - m.Save(1, &x.tail) + m.Save(0, &x.opcode) + m.Save(1, &x.hdr) + m.Save(2, &x.data) } -func (x *requestList) afterLoad() {} +func (x *Response) afterLoad() {} -func (x *requestList) StateLoad(m state.Source) { - m.Load(0, &x.head) - m.Load(1, &x.tail) +func (x *Response) StateLoad(m state.Source) { + m.Load(0, &x.opcode) + m.Load(1, &x.hdr) + m.Load(2, &x.data) } -func (x *requestEntry) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.requestEntry" +func (x *futureResponse) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.futureResponse" } -func (x *requestEntry) StateFields() []string { +func (x *futureResponse) StateFields() []string { return []string{ - "next", - "prev", + "opcode", + "ch", + "hdr", + "data", } } -func (x *requestEntry) beforeSave() {} +func (x *futureResponse) beforeSave() {} -func (x *requestEntry) StateSave(m state.Sink) { +func (x *futureResponse) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.next) - m.Save(1, &x.prev) + m.Save(0, &x.opcode) + m.Save(1, &x.ch) + m.Save(2, &x.hdr) + m.Save(3, &x.data) } -func (x *requestEntry) afterLoad() {} +func (x *futureResponse) afterLoad() {} -func (x *requestEntry) StateLoad(m state.Source) { - m.Load(0, &x.next) - m.Load(1, &x.prev) +func (x *futureResponse) StateLoad(m state.Source) { + m.Load(0, &x.opcode) + m.Load(1, &x.ch) + m.Load(2, &x.hdr) + m.Load(3, &x.data) } -func (x *Request) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.Request" +func (x *inodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.inodeRefs" } -func (x *Request) StateFields() []string { +func (x *inodeRefs) StateFields() []string { return []string{ - "requestEntry", - "id", - "hdr", - "data", - "payload", - "async", - "noReply", + "refCount", } } -func (x *Request) beforeSave() {} +func (x *inodeRefs) beforeSave() {} -func (x *Request) StateSave(m state.Sink) { +func (x *inodeRefs) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.requestEntry) - m.Save(1, &x.id) - m.Save(2, &x.hdr) - m.Save(3, &x.data) - m.Save(4, &x.payload) - m.Save(5, &x.async) - m.Save(6, &x.noReply) + m.Save(0, &x.refCount) } -func (x *Request) afterLoad() {} +func (x *inodeRefs) afterLoad() {} -func (x *Request) StateLoad(m state.Source) { - m.Load(0, &x.requestEntry) - m.Load(1, &x.id) - m.Load(2, &x.hdr) - m.Load(3, &x.data) - m.Load(4, &x.payload) - m.Load(5, &x.async) - m.Load(6, &x.noReply) +func (x *inodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) } -func (x *futureResponse) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.futureResponse" +func (x *requestList) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.requestList" } -func (x *futureResponse) StateFields() []string { +func (x *requestList) StateFields() []string { return []string{ - "opcode", - "ch", - "hdr", - "data", - "async", + "head", + "tail", } } -func (x *futureResponse) beforeSave() {} +func (x *requestList) beforeSave() {} -func (x *futureResponse) StateSave(m state.Sink) { +func (x *requestList) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.opcode) - m.Save(1, &x.ch) - m.Save(2, &x.hdr) - m.Save(3, &x.data) - m.Save(4, &x.async) + m.Save(0, &x.head) + m.Save(1, &x.tail) } -func (x *futureResponse) afterLoad() {} +func (x *requestList) afterLoad() {} -func (x *futureResponse) StateLoad(m state.Source) { - m.Load(0, &x.opcode) - m.Load(1, &x.ch) - m.Load(2, &x.hdr) - m.Load(3, &x.data) - m.Load(4, &x.async) +func (x *requestList) StateLoad(m state.Source) { + m.Load(0, &x.head) + m.Load(1, &x.tail) } -func (x *Response) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.Response" +func (x *requestEntry) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.requestEntry" } -func (x *Response) StateFields() []string { +func (x *requestEntry) StateFields() []string { return []string{ - "opcode", - "hdr", - "data", + "next", + "prev", } } -func (x *Response) beforeSave() {} +func (x *requestEntry) beforeSave() {} -func (x *Response) StateSave(m state.Sink) { +func (x *requestEntry) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.opcode) - m.Save(1, &x.hdr) - m.Save(2, &x.data) + m.Save(0, &x.next) + m.Save(1, &x.prev) } -func (x *Response) afterLoad() {} +func (x *requestEntry) afterLoad() {} -func (x *Response) StateLoad(m state.Source) { - m.Load(0, &x.opcode) - m.Load(1, &x.hdr) - m.Load(2, &x.data) +func (x *requestEntry) StateLoad(m state.Source) { + m.Load(0, &x.next) + m.Load(1, &x.prev) } func init() { + state.Register((*Request)(nil)) + state.Register((*Response)(nil)) + state.Register((*futureResponse)(nil)) state.Register((*inodeRefs)(nil)) state.Register((*requestList)(nil)) state.Register((*requestEntry)(nil)) - state.Register((*Request)(nil)) - state.Register((*futureResponse)(nil)) - state.Register((*Response)(nil)) } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 402dabe5a..810819ae4 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -16,10 +16,7 @@ package fuse import ( - "math" "strconv" - "sync" - "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -29,17 +26,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // Name is the default filesystem name. const Name = "fuse" -// maxActiveRequestsDefault is the default setting controlling the upper bound -// on the number of active requests at any given time. -const maxActiveRequestsDefault = 10000 - // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -65,11 +56,6 @@ type filesystemOptions struct { // exist at any time. Any further requests will block when trying to // Call the server. maxActiveRequests uint64 - - // maxRead is the max number of bytes to read, - // specified as "max_read" in fs parameters. - // If not specified by user, use math.MaxUint32 as default value. - maxRead uint32 } // filesystem implements vfs.FilesystemImpl. @@ -83,9 +69,6 @@ type filesystem struct { // opts is the options the fusefs is initialized with. opts *filesystemOptions - - // umounted is true if filesystem.Release() has been called. - umounted bool } // Name implements vfs.FilesystemType.Name. @@ -159,29 +142,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Set the maxInFlightRequests option. fsopts.maxActiveRequests = maxActiveRequestsDefault - if maxReadStr, ok := mopts["max_read"]; ok { - delete(mopts, "max_read") - maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) - if err != nil { - log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) - return nil, nil, syserror.EINVAL - } - if maxRead < fuseMinMaxRead { - maxRead = fuseMinMaxRead - } - fsopts.maxRead = uint32(maxRead) - } else { - fsopts.maxRead = math.MaxUint32 - } - // Check for unparsed options. if len(mopts) != 0 { - log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) + log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) return nil, nil, syserror.EINVAL } // Create a new FUSE filesystem. - fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) if err != nil { log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) return nil, nil, err @@ -197,27 +165,26 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // root is the fusefs root directory. - root := fs.newRootInode(creds, fsopts.rootMode) + root := fs.newInode(creds, fsopts.rootMode) return fs.VFSFilesystem(), root.VFSDentry(), nil } -// newFUSEFilesystem creates a new FUSE filesystem. -func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { - conn, err := newFUSEConnection(ctx, device, opts) +// NewFUSEFilesystem creates a new FUSE filesystem. +func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { + fs := &filesystem{ + devMinor: devMinor, + opts: opts, + } + + conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, syserror.EINVAL } + fs.conn = conn fuseFD := device.Impl().(*DeviceFD) - - fs := &filesystem{ - devMinor: devMinor, - opts: opts, - conn: conn, - } - fuseFD.fs = fs return fs, nil @@ -225,14 +192,6 @@ func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { - fs.umounted = true - fs.conn.Abort(ctx) - - fs.conn.fd.mu.Lock() - // Notify all the waiters on this fd. - fs.conn.fd.waitQueue.Notify(waiter.EventIn) - fs.conn.fd.mu.Unlock() - fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } @@ -246,49 +205,14 @@ type inode struct { kernfs.InodeNotSymlink kernfs.OrderedChildren - dentry kernfs.Dentry - - // the owning filesystem. fs is immutable. - fs *filesystem - - // metaDataMu protects the metadata of this inode. - metadataMu sync.Mutex - - nodeID uint64 - locks vfs.FileLocks - // size of the file. - size uint64 - - // attributeVersion is the version of inode's attributes. - attributeVersion uint64 - - // attributeTime is the remaining vaild time of attributes. - attributeTime uint64 - - // version of the inode. - version uint64 - - // link is result of following a symbolic link. - link string -} - -func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { - i := &inode{fs: fs} - i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755) - i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - i.dentry.Init(i) - i.nodeID = 1 - - return &i.dentry + dentry kernfs.Dentry } -func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry { - i := &inode{fs: fs, nodeID: nodeID} - creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)} - i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode)) - atomic.StoreUint64(&i.size, attr.Size) +func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { + i := &inode{} + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.EnableLeakCheck() i.dentry.Init(i) @@ -298,301 +222,13 @@ func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentr // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - isDir := i.InodeAttrs.Mode().IsDir() - // return error if specified to open directory but inode is not a directory. - if !isDir && opts.Mode.IsDir() { - return nil, syserror.ENOTDIR - } - if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS { - return nil, syserror.EOVERFLOW - } - - var fd *fileDescription - var fdImpl vfs.FileDescriptionImpl - if isDir { - directoryFD := &directoryFD{} - fd = &(directoryFD.fileDescription) - fdImpl = directoryFD - } else { - regularFD := ®ularFileFD{} - fd = &(regularFD.fileDescription) - fdImpl = regularFD - } - // FOPEN_KEEP_CACHE is the defualt flag for noOpen. - fd.OpenFlag = linux.FOPEN_KEEP_CACHE - - // Only send open request when FUSE server support open or is opening a directory. - if !i.fs.conn.noOpen || isDir { - kernelTask := kernel.TaskFromContext(ctx) - if kernelTask == nil { - log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context") - return nil, syserror.EINVAL - } - - // Build the request. - var opcode linux.FUSEOpcode - if isDir { - opcode = linux.FUSE_OPENDIR - } else { - opcode = linux.FUSE_OPEN - } - - in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)} - if !i.fs.conn.atomicOTrunc { - in.Flags &= ^uint32(linux.O_TRUNC) - } - - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in) - if err != nil { - return nil, err - } - - // Send the request and receive the reply. - res, err := i.fs.conn.Call(kernelTask, req) - if err != nil { - return nil, err - } - if err := res.Error(); err == syserror.ENOSYS && !isDir { - i.fs.conn.noOpen = true - } else if err != nil { - return nil, err - } else { - out := linux.FUSEOpenOut{} - if err := res.UnmarshalPayload(&out); err != nil { - return nil, err - } - - // Process the reply. - fd.OpenFlag = out.OpenFlag - if isDir { - fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO) - } - - fd.Fh = out.Fh - } - } - - // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode - fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0 - fdOptions := &vfs.FileDescriptionOptions{} - if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 { - fdOptions.DenyPRead = true - fdOptions.DenyPWrite = true - fd.Nonseekable = true - } - - // If we don't send SETATTR before open (which is indicated by atomicOTrunc) - // and O_TRUNC is set, update the inode's version number and clean existing data - // by setting the file size to 0. - if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 { - i.fs.conn.mu.Lock() - i.fs.conn.attributeVersion++ - i.attributeVersion = i.fs.conn.attributeVersion - atomic.StoreUint64(&i.size, 0) - i.fs.conn.mu.Unlock() - i.attributeTime = 0 - } - - if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), vfsd, fdOptions); err != nil { - return nil, err - } - return &fd.vfsfd, nil -} - -// Lookup implements kernfs.Inode.Lookup. -func (i *inode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - in := linux.FUSELookupIn{Name: name} - return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in) -} - -// IterDirents implements kernfs.Inode.IterDirents. -func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { - return offset, nil -} - -// Valid implements kernfs.Inode.Valid. -func (*inode) Valid(ctx context.Context) bool { - return true -} - -// NewFile implements kernfs.Inode.NewFile. -func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { - kernelTask := kernel.TaskFromContext(ctx) - if kernelTask == nil { - log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID) - return nil, syserror.EINVAL - } - in := linux.FUSECreateIn{ - CreateMeta: linux.FUSECreateMeta{ - Flags: opts.Flags, - Mode: uint32(opts.Mode) | linux.S_IFREG, - Umask: uint32(kernelTask.FSContext().Umask()), - }, - Name: name, - } - return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in) -} - -// NewNode implements kernfs.Inode.NewNode. -func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) { - in := linux.FUSEMknodIn{ - MknodMeta: linux.FUSEMknodMeta{ - Mode: uint32(opts.Mode), - Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor), - Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), - }, - Name: name, - } - return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in) -} - -// NewSymlink implements kernfs.Inode.NewSymlink. -func (i *inode) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) { - in := linux.FUSESymLinkIn{ - Name: name, - Target: target, - } - return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in) -} - -// Unlink implements kernfs.Inode.Unlink. -func (i *inode) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { - kernelTask := kernel.TaskFromContext(ctx) - if kernelTask == nil { - log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) - return syserror.EINVAL - } - in := linux.FUSEUnlinkIn{Name: name} - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) - if err != nil { - return err - } - res, err := i.fs.conn.Call(kernelTask, req) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { - return err - } - // only return error, discard res. - if err := res.Error(); err != nil { - return err - } - return i.dentry.RemoveChildLocked(name, child) -} - -// NewDir implements kernfs.Inode.NewDir. -func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { - in := linux.FUSEMkdirIn{ - MkdirMeta: linux.FUSEMkdirMeta{ - Mode: uint32(opts.Mode), - Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), - }, - Name: name, - } - return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in) -} - -// RmDir implements kernfs.Inode.RmDir. -func (i *inode) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { - fusefs := i.fs - task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) - - in := linux.FUSERmDirIn{Name: name} - req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in) - if err != nil { - return err - } - - res, err := i.fs.conn.Call(task, req) - if err != nil { - return err - } - if err := res.Error(); err != nil { - return err - } - - // TODO(Before merging): When creating new nodes, should we add nodes to the ordered children? - // If so we'll probably need to call this. We will also need to add them with the writable flag when - // appropriate. - // return i.OrderedChildren.RmDir(ctx, name, child) - - return nil -} - -// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response. -// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP. -func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*vfs.Dentry, error) { - kernelTask := kernel.TaskFromContext(ctx) - if kernelTask == nil { - log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) - return nil, syserror.EINVAL - } - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) - if err != nil { - return nil, err - } - res, err := i.fs.conn.Call(kernelTask, req) - if err != nil { - return nil, err - } - if err := res.Error(); err != nil { return nil, err } - out := linux.FUSEEntryOut{} - if err := res.UnmarshalPayload(&out); err != nil { - return nil, err - } - if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { - return nil, syserror.EIO - } - child := i.fs.newInode(out.NodeID, out.Attr) - if opcode == linux.FUSE_LOOKUP { - i.dentry.InsertChildLocked(name, child) - } else { - i.dentry.InsertChild(name, child) - } - return child.VFSDentry(), nil -} - -// Getlink implements kernfs.Inode.Getlink. -func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { - path, err := i.Readlink(ctx, mnt) - return vfs.VirtualDentry{}, path, err -} - -// Readlink implements kernfs.Inode.Readlink. -func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { - if i.Mode().FileType()&linux.S_IFLNK == 0 { - return "", syserror.EINVAL - } - if len(i.link) == 0 { - kernelTask := kernel.TaskFromContext(ctx) - if kernelTask == nil { - log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") - return "", syserror.EINVAL - } - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) - if err != nil { - return "", err - } - res, err := i.fs.conn.Call(kernelTask, req) - if err != nil { - return "", err - } - i.link = string(res.data[res.hdr.SizeBytes():]) - if !mnt.Options().ReadOnly { - i.attributeTime = 0 - } - } - return i.link, nil -} - -// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache. -// TODO(gvisor.dev/issue/3679): Add support for other fields. -func (i *inode) getFUSEAttr() linux.FUSEAttr { - return linux.FUSEAttr{ - Ino: i.Ino(), - Size: atomic.LoadUint64(&i.size), - Mode: uint32(i.Mode()), - } + return fd.VFSFileDescription(), nil } // statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The @@ -648,89 +284,47 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx { return stat } -// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request -// or read from local cache. It updates the corresponding attributes if -// necessary. -func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) { - attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion) - - // TODO(gvisor.dev/issue/3679): send the request only if - // - invalid local cache for fields specified in the opts.Mask - // - forced update - // - i.attributeTime expired - // If local cache is still valid, return local cache. - // Currently we always send a request, - // and we always set the metadata with the new result, - // unless attributeVersion has changed. - - task := kernel.TaskFromContext(ctx) +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + fusefs := fs.Impl().(*filesystem) + conn := fusefs.conn + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return linux.FUSEAttr{}, syserror.EINVAL + return linux.Statx{}, syserror.EINVAL } - creds := auth.CredentialsFromContext(ctx) - - in := linux.FUSEGetAttrIn{ - GetAttrFlags: flags, - Fh: fh, - } - req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) + var in linux.FUSEGetAttrIn + // We don't set any attribute in the request, because in VFS2 fstat(2) will + // finally be translated into vfs.FilesystemImpl.StatAt() (see + // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow + // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2. + req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in) if err != nil { - return linux.FUSEAttr{}, err + return linux.Statx{}, err } - res, err := i.fs.conn.Call(task, req) + res, err := conn.Call(task, req) if err != nil { - return linux.FUSEAttr{}, err + return linux.Statx{}, err } if err := res.Error(); err != nil { - return linux.FUSEAttr{}, err + return linux.Statx{}, err } var out linux.FUSEGetAttrOut if err := res.UnmarshalPayload(&out); err != nil { - return linux.FUSEAttr{}, err - } - - // Local version is newer, return the local one. - // Skip the update. - if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion { - return i.getFUSEAttr(), nil + return linux.Statx{}, err } - // Set the metadata of kernfs.InodeAttrs. - if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ - Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), + // Set all metadata into kernfs.InodeAttrs. + if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor), }); err != nil { - return linux.FUSEAttr{}, err - } - - // Set the size if no error (after SetStat() check). - atomic.StoreUint64(&i.size, out.Attr.Size) - - return out.Attr, nil -} - -// reviseAttr attempts to update the attributes for internal purposes -// by calling getAttr with a pre-specified mask. -// Used by read, write, lseek. -func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error { - // Never need atime for internal purposes. - _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{ - Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME, - }, flags, fh) - return err -} - -// Stat implements kernfs.Inode.Stat. -func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - attr, err := i.getAttr(ctx, fs, opts, 0, 0) - if err != nil { return linux.Statx{}, err } - return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil + return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil } // DecRef implements kernfs.Inode. @@ -743,84 +337,3 @@ func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, e // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs. return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil } - -// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask -// aligned with the attribute mask defined in include/linux/fs.h. -func fattrMaskFromStats(mask uint32) uint32 { - var fuseAttrMask uint32 - maskMap := map[uint32]uint32{ - linux.STATX_MODE: linux.FATTR_MODE, - linux.STATX_UID: linux.FATTR_UID, - linux.STATX_GID: linux.FATTR_GID, - linux.STATX_SIZE: linux.FATTR_SIZE, - linux.STATX_ATIME: linux.FATTR_ATIME, - linux.STATX_MTIME: linux.FATTR_MTIME, - linux.STATX_CTIME: linux.FATTR_CTIME, - } - for statxMask, fattrMask := range maskMap { - if mask&statxMask != 0 { - fuseAttrMask |= fattrMask - } - } - return fuseAttrMask -} - -// SetStat implements kernfs.Inode.SetStat. -func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - return i.setAttr(ctx, fs, creds, opts, false, 0) -} - -func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error { - conn := i.fs.conn - task := kernel.TaskFromContext(ctx) - if task == nil { - log.Warningf("couldn't get kernel task from context") - return syserror.EINVAL - } - - // We should retain the original file type when assigning new mode. - fileType := uint16(i.Mode()) & linux.S_IFMT - fattrMask := fattrMaskFromStats(opts.Stat.Mask) - if useFh { - fattrMask |= linux.FATTR_FH - } - in := linux.FUSESetAttrIn{ - Valid: fattrMask, - Fh: fh, - Size: opts.Stat.Size, - Atime: uint64(opts.Stat.Atime.Sec), - Mtime: uint64(opts.Stat.Mtime.Sec), - Ctime: uint64(opts.Stat.Ctime.Sec), - AtimeNsec: opts.Stat.Atime.Nsec, - MtimeNsec: opts.Stat.Mtime.Nsec, - CtimeNsec: opts.Stat.Ctime.Nsec, - Mode: uint32(fileType | opts.Stat.Mode), - UID: opts.Stat.UID, - GID: opts.Stat.GID, - } - req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) - if err != nil { - return err - } - - res, err := conn.Call(task, req) - if err != nil { - return err - } - if err := res.Error(); err != nil { - return err - } - out := linux.FUSEGetAttrOut{} - if err := res.UnmarshalPayload(&out); err != nil { - return err - } - - // Set the metadata of kernfs.InodeAttrs. - if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ - Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), - }); err != nil { - return err - } - - return nil -} diff --git a/pkg/sentry/fsimpl/fuse/connection_control.go b/pkg/sentry/fsimpl/fuse/init.go index a63c66e7c..779c2bd3f 100644 --- a/pkg/sentry/fsimpl/fuse/connection_control.go +++ b/pkg/sentry/fsimpl/fuse/init.go @@ -15,11 +15,7 @@ package fuse import ( - "sync/atomic" - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) @@ -33,10 +29,9 @@ const ( // Follow the same behavior as unix fuse implementation. fuseMaxTimeGranNs = 1000000000 - // Minimum value for MaxWrite and MaxRead. + // Minimum value for MaxWrite. // Follow the same behavior as unix fuse implementation. fuseMinMaxWrite = 4096 - fuseMinMaxRead = 4096 // Temporary default value for max readahead, 128kb. fuseDefaultMaxReadahead = 131072 @@ -54,26 +49,6 @@ var ( MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold ) -// SetInitialized atomically sets the connection as initialized. -func (conn *connection) SetInitialized() { - // Unblock the requests sent before INIT. - close(conn.initializedChan) - - // Close the channel first to avoid the non-atomic situation - // where conn.initialized is true but there are - // tasks being blocked on the channel. - // And it prevents the newer tasks from gaining - // unnecessary higher chance to be issued before the blocked one. - - atomic.StoreInt32(&(conn.initialized), int32(1)) -} - -// IsInitialized atomically check if the connection is initialized. -// pairs with SetInitialized(). -func (conn *connection) Initialized() bool { - return atomic.LoadInt32(&(conn.initialized)) != 0 -} - // InitSend sends a FUSE_INIT request. func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { in := linux.FUSEInitIn{ @@ -100,24 +75,24 @@ func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { return err } - initRes := fuseInitRes{initLen: res.DataLen()} - if err := res.UnmarshalPayload(&initRes); err != nil { + var out linux.FUSEInitOut + if err := res.UnmarshalPayload(&out); err != nil { return err } - return conn.initProcessReply(&initRes.initOut, hasSysAdminCap) + return conn.initProcessReply(&out, hasSysAdminCap) } // Process the FUSE_INIT reply from the FUSE server. -// It tries to acquire the conn.asyncMu lock if minor version is newer than 13. func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { - // No matter error or not, always set initialzied. - // to unblock the blocked requests. - defer conn.SetInitialized() - // No support for old major fuse versions. if out.Major != linux.FUSE_KERNEL_VERSION { conn.connInitError = true + + // Set the connection as initialized and unblock the blocked requests + // (i.e. return error for them). + conn.SetInitialized() + return nil } @@ -125,14 +100,29 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap conn.connInitSuccess = true conn.minor = out.Minor - // No support for negotiating MaxWrite before minor version 5. - if out.Minor >= 5 { - conn.maxWrite = out.MaxWrite - } else { - conn.maxWrite = fuseMinMaxWrite - } - if conn.maxWrite < fuseMinMaxWrite { - conn.maxWrite = fuseMinMaxWrite + // No support for limits before minor version 13. + if out.Minor >= 13 { + conn.bgLock.Lock() + + if out.MaxBackground > 0 { + conn.maxBackground = out.MaxBackground + + if !hasSysAdminCap && + conn.maxBackground > MaxUserBackgroundRequest { + conn.maxBackground = MaxUserBackgroundRequest + } + } + + if out.CongestionThreshold > 0 { + conn.congestionThreshold = out.CongestionThreshold + + if !hasSysAdminCap && + conn.congestionThreshold > MaxUserCongestionThreshold { + conn.congestionThreshold = MaxUserCongestionThreshold + } + } + + conn.bgLock.Unlock() } // No support for the following flags before minor version 6. @@ -141,6 +131,8 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 + conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0 + conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0 // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). @@ -156,90 +148,19 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap } } - // No support for limits before minor version 13. - if out.Minor >= 13 { - conn.asyncMu.Lock() - - if out.MaxBackground > 0 { - conn.asyncNumMax = out.MaxBackground - - if !hasSysAdminCap && - conn.asyncNumMax > MaxUserBackgroundRequest { - conn.asyncNumMax = MaxUserBackgroundRequest - } - } - - if out.CongestionThreshold > 0 { - conn.asyncCongestionThreshold = out.CongestionThreshold - - if !hasSysAdminCap && - conn.asyncCongestionThreshold > MaxUserCongestionThreshold { - conn.asyncCongestionThreshold = MaxUserCongestionThreshold - } - } - - conn.asyncMu.Unlock() - } - - return nil -} - -// Abort this FUSE connection. -// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order. -// All possible requests waiting or blocking will be aborted. -func (conn *connection) Abort(ctx context.Context) { - conn.fd.mu.Lock() - conn.mu.Lock() - conn.asyncMu.Lock() - - if !conn.connected { - conn.asyncMu.Unlock() - conn.mu.Unlock() - conn.fd.mu.Unlock() - return - } - - conn.connected = false - - // Empty the `fd.queue` that holds the requests - // not yet read by the FUSE daemon yet. - // These are a subset of the requests in `fuse.completion` map. - for !conn.fd.queue.Empty() { - req := conn.fd.queue.Front() - conn.fd.queue.Remove(req) - } - - var terminate []linux.FUSEOpID - - // 2. Collect the requests have not been sent to FUSE daemon, - // or have not received a reply. - for unique := range conn.fd.completions { - terminate = append(terminate, unique) - } - - // Release all locks to avoid deadlock. - conn.asyncMu.Unlock() - conn.mu.Unlock() - conn.fd.mu.Unlock() - - // 1. The requets blocked before initialization. - // Will reach call() `connected` check and return. - if !conn.Initialized() { - conn.SetInitialized() + // No support for negotiating MaxWrite before minor version 5. + if out.Minor >= 5 { + conn.maxWrite = out.MaxWrite + } else { + conn.maxWrite = fuseMinMaxWrite } - - // 2. Terminate the requests collected above. - // Set ECONNABORTED error. - // sendError() will remove them from `fd.completion` map. - // Will enter the path of a normally received error. - for _, toTerminate := range terminate { - conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate) + if conn.maxWrite < fuseMinMaxWrite { + conn.maxWrite = fuseMinMaxWrite } - // 3. The requests not yet written to FUSE device. - // Early terminate. - // Will reach callFutureLocked() `connected` check and return. - close(conn.fd.fullQueueCh) + // Set connection as initialized and unblock the requests + // issued before init. + conn.SetInitialized() - // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs. + return nil } diff --git a/pkg/sentry/fsimpl/fuse/inode_refs.go b/pkg/sentry/fsimpl/fuse/inode_refs.go index 6b9456e1d..4fb4d4da7 100644 --- a/pkg/sentry/fsimpl/fuse/inode_refs.go +++ b/pkg/sentry/fsimpl/fuse/inode_refs.go @@ -2,11 +2,10 @@ package fuse import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go deleted file mode 100644 index 625d1547f..000000000 --- a/pkg/sentry/fsimpl/fuse/read_write.go +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fuse - -import ( - "io" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// ReadInPages sends FUSE_READ requests for the size after round it up to -// a multiple of page size, blocks on it for reply, processes the reply -// and returns the payload (or joined payloads) as a byte slice. -// This is used for the general purpose reading. -// We do not support direct IO (which read the exact number of bytes) -// at this moment. -func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) { - attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion) - - t := kernel.TaskFromContext(ctx) - if t == nil { - log.Warningf("fusefs.Read: couldn't get kernel task from context") - return nil, 0, syserror.EINVAL - } - - // Round up to a multiple of page size. - readSize, _ := usermem.PageRoundUp(uint64(size)) - - // One request cannnot exceed either maxRead or maxPages. - maxPages := fs.conn.maxRead >> usermem.PageShift - if maxPages > uint32(fs.conn.maxPages) { - maxPages = uint32(fs.conn.maxPages) - } - - var outs [][]byte - var sizeRead uint32 - - // readSize is a multiple of usermem.PageSize. - // Always request bytes as a multiple of pages. - pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift) - - // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. - in := linux.FUSEReadIn{ - Fh: fd.Fh, - LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock - ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER - Flags: fd.statusFlags(), - } - - // This loop is intended for fragmented read where the bytes to read is - // larger than either the maxPages or maxRead. - // For the majority of reads with normal size, this loop should only - // execute once. - for pagesRead < pagesToRead { - pagesCanRead := pagesToRead - pagesRead - if pagesCanRead > maxPages { - pagesCanRead = maxPages - } - - in.Offset = off + (uint64(pagesRead) << usermem.PageShift) - in.Size = pagesCanRead << usermem.PageShift - - req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in) - if err != nil { - return nil, 0, err - } - - // TODO(gvisor.dev/issue/3247): support async read. - - res, err := fs.conn.Call(t, req) - if err != nil { - return nil, 0, err - } - if err := res.Error(); err != nil { - return nil, 0, err - } - - // Not enough bytes in response, - // either we reached EOF, - // or the FUSE server sends back a response - // that cannot even fit the hdr. - if len(res.data) <= res.hdr.SizeBytes() { - // We treat both case as EOF here for now - // since there is no reliable way to detect - // the over-short hdr case. - break - } - - // Directly using the slice to avoid extra copy. - out := res.data[res.hdr.SizeBytes():] - - outs = append(outs, out) - sizeRead += uint32(len(out)) - - pagesRead += pagesCanRead - } - - defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion) - - // No bytes returned: offset >= EOF. - if len(outs) == 0 { - return nil, 0, io.EOF - } - - return outs, sizeRead, nil -} - -// ReadCallback updates several information after receiving a read response. -// Due to readahead, sizeRead can be larger than size. -func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) { - // TODO(gvisor.dev/issue/3247): support async read. - // If this is called by an async read, correctly process it. - // May need to update the signature. - - i := fd.inode() - // TODO(gvisor.dev/issue/1193): Invalidate or update atime. - - // Reached EOF. - if sizeRead < size { - // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole. - // Might need to update the buf to be returned from the Read(). - - // Update existing size. - newSize := off + uint64(sizeRead) - fs.conn.mu.Lock() - if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) { - fs.conn.attributeVersion++ - i.attributeVersion = i.fs.conn.attributeVersion - atomic.StoreUint64(&i.size, newSize) - } - fs.conn.mu.Unlock() - } -} - -// Write sends FUSE_WRITE requests and return the bytes -// written according to the response. -// -// Preconditions: len(data) == size. -func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) { - t := kernel.TaskFromContext(ctx) - if t == nil { - log.Warningf("fusefs.Read: couldn't get kernel task from context") - return 0, syserror.EINVAL - } - - // One request cannnot exceed either maxWrite or maxPages. - maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift - if maxWrite > fs.conn.maxWrite { - maxWrite = fs.conn.maxWrite - } - - // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. - in := linux.FUSEWriteIn{ - Fh: fd.Fh, - // TODO(gvisor.dev/issue/3245): file lock - LockOwner: 0, - // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER - // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet) - WriteFlags: 0, - Flags: fd.statusFlags(), - } - - var written uint32 - - // This loop is intended for fragmented write where the bytes to write is - // larger than either the maxWrite or maxPages or when bigWrites is false. - // Unless a small value for max_write is explicitly used, this loop - // is expected to execute only once for the majority of the writes. - for written < size { - toWrite := size - written - - // Limit the write size to one page. - // Note that the bigWrites flag is obsolete, - // latest libfuse always sets it on. - if !fs.conn.bigWrites && toWrite > usermem.PageSize { - toWrite = usermem.PageSize - } - - // Limit the write size to maxWrite. - if toWrite > maxWrite { - toWrite = maxWrite - } - - in.Offset = off + uint64(written) - in.Size = toWrite - - req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in) - if err != nil { - return 0, err - } - - req.payload = data[written : written+toWrite] - - // TODO(gvisor.dev/issue/3247): support async write. - - res, err := fs.conn.Call(t, req) - if err != nil { - return 0, err - } - if err := res.Error(); err != nil { - return 0, err - } - - out := linux.FUSEWriteOut{} - if err := res.UnmarshalPayload(&out); err != nil { - return 0, err - } - - // Write more than requested? EIO. - if out.Size > toWrite { - return 0, syserror.EIO - } - - written += out.Size - - // Break if short write. Not necessarily an error. - if out.Size != toWrite { - break - } - } - - return written, nil -} diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go deleted file mode 100644 index 5bdd096c3..000000000 --- a/pkg/sentry/fsimpl/fuse/regular_file.go +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fuse - -import ( - "io" - "math" - "sync" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -type regularFileFD struct { - fileDescription - - // off is the file offset. - off int64 - // offMu protects off. - offMu sync.Mutex -} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - - // Check that flags are supported. - // - // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. - if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP - } - - size := dst.NumBytes() - if size == 0 { - // Early return if count is 0. - return 0, nil - } else if size > math.MaxUint32 { - // FUSE only supports uint32 for size. - // Overflow. - return 0, syserror.EINVAL - } - - // TODO(gvisor.dev/issue/3678): Add direct IO support. - - inode := fd.inode() - - // Reading beyond EOF, update file size if outdated. - if uint64(offset+size) > atomic.LoadUint64(&inode.size) { - if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil { - return 0, err - } - // If the offset after update is still too large, return error. - if uint64(offset) >= atomic.LoadUint64(&inode.size) { - return 0, io.EOF - } - } - - // Truncate the read with updated file size. - fileSize := atomic.LoadUint64(&inode.size) - if uint64(offset+size) > fileSize { - size = int64(fileSize) - offset - } - - buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size)) - if err != nil { - return 0, err - } - - // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching), - // store the bytes that were read ahead. - - // Update the number of bytes to copy for short read. - if n < uint32(size) { - size = int64(n) - } - - // Copy the bytes read to the dst. - // This loop is intended for fragmented reads. - // For the majority of reads, this loop only execute once. - var copied int64 - for _, buffer := range buffers { - toCopy := int64(len(buffer)) - if copied+toCopy > size { - toCopy = size - copied - } - cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy]) - if err != nil { - return 0, err - } - if int64(cp) != toCopy { - return 0, syserror.EIO - } - copied += toCopy - } - - return copied, nil -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - fd.offMu.Lock() - n, err := fd.PRead(ctx, dst, fd.off, opts) - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - n, _, err := fd.pwrite(ctx, src, offset, opts) - return n, err -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - fd.offMu.Lock() - n, off, err := fd.pwrite(ctx, src, fd.off, opts) - fd.off = off - fd.offMu.Unlock() - return n, err -} - -// pwrite returns the number of bytes written, final offset and error. The -// final offset should be ignored by PWrite. -func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { - if offset < 0 { - return 0, offset, syserror.EINVAL - } - - // Check that flags are supported. - // - // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. - if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, offset, syserror.EOPNOTSUPP - } - - inode := fd.inode() - inode.metadataMu.Lock() - defer inode.metadataMu.Unlock() - - // If the file is opened with O_APPEND, update offset to file size. - // Note: since our Open() implements the interface of kernfs, - // and kernfs currently does not support O_APPEND, this will never - // be true before we switch out from kernfs. - if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { - // Locking inode.metadataMu is sufficient for reading size - offset = int64(inode.size) - } - - srclen := src.NumBytes() - - if srclen > math.MaxUint32 { - // FUSE only supports uint32 for size. - // Overflow. - return 0, offset, syserror.EINVAL - } - if end := offset + srclen; end < offset { - // Overflow. - return 0, offset, syserror.EINVAL - } - - srclen, err = vfs.CheckLimit(ctx, offset, srclen) - if err != nil { - return 0, offset, err - } - - if srclen == 0 { - // Return before causing any side effects. - return 0, offset, nil - } - - src = src.TakeFirst64(srclen) - - // TODO(gvisor.dev/issue/3237): Add cache support: - // buffer cache. Ideally we write from src to our buffer cache first. - // The slice passed to fs.Write() should be a slice from buffer cache. - data := make([]byte, srclen) - // Reason for making a copy here: connection.Call() blocks on kerneltask, - // which in turn acquires mm.activeMu lock. Functions like CopyInTo() will - // attemp to acquire the mm.activeMu lock as well -> deadlock. - // We must finish reading from the userspace memory before - // t.Block() deactivates it. - cp, err := src.CopyIn(ctx, data) - if err != nil { - return 0, offset, err - } - if int64(cp) != srclen { - return 0, offset, syserror.EIO - } - - n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data) - if err != nil { - return 0, offset, err - } - - if n == 0 { - // We have checked srclen != 0 previously. - // If err == nil, then it's a short write and we return EIO. - return 0, offset, syserror.EIO - } - - written = int64(n) - finalOff = offset + written - - if finalOff > int64(inode.size) { - atomic.StoreUint64(&inode.size, uint64(finalOff)) - atomic.AddUint64(&inode.fs.conn.attributeVersion, 1) - } - - return -} diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go deleted file mode 100644 index 70db50f38..000000000 --- a/pkg/sentry/fsimpl/fuse/request_response.go +++ /dev/null @@ -1,229 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fuse - -import ( - "fmt" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" -) - -// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE -// server may implement an older version of FUSE protocol, which contains a -// linux.FUSEInitOut with less attributes. -// -// Dynamically-sized objects cannot be marshalled. -type fuseInitRes struct { - marshal.StubMarshallable - - // initOut contains the response from the FUSE server. - initOut linux.FUSEInitOut - - // initLen is the total length of bytes of the response. - initLen uint32 -} - -// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes. -func (r *fuseInitRes) UnmarshalBytes(src []byte) { - out := &r.initOut - - // Introduced before FUSE kernel version 7.13. - out.Major = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2])) - src = src[2:] - out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2])) - src = src[2:] - out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - - // Introduced in FUSE kernel version 7.23. - if len(src) >= 4 { - out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - } - // Introduced in FUSE kernel version 7.28. - if len(src) >= 2 { - out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2])) - src = src[2:] - } -} - -// SizeBytes is the size of the payload of the FUSE_INIT response. -func (r *fuseInitRes) SizeBytes() int { - return int(r.initLen) -} - -// Ordinary requests have even IDs, while interrupts IDs are odd. -// Used to increment the unique ID for each FUSE request. -var reqIDStep uint64 = 2 - -// Request represents a FUSE operation request that hasn't been sent to the -// server yet. -// -// +stateify savable -type Request struct { - requestEntry - - id linux.FUSEOpID - hdr *linux.FUSEHeaderIn - data []byte - - // payload for this request: extra bytes to write after - // the data slice. Used by FUSE_WRITE. - payload []byte - - // If this request is async. - async bool - // If we don't care its response. - // Manually set by the caller. - noReply bool -} - -// NewRequest creates a new request that can be sent to the FUSE server. -func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { - conn.fd.mu.Lock() - defer conn.fd.mu.Unlock() - conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) - - hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() - hdr := linux.FUSEHeaderIn{ - Len: uint32(hdrLen + payload.SizeBytes()), - Opcode: opcode, - Unique: conn.fd.nextOpID, - NodeID: ino, - UID: uint32(creds.EffectiveKUID), - GID: uint32(creds.EffectiveKGID), - PID: pid, - } - - buf := make([]byte, hdr.Len) - - // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. - hdr.MarshalBytes(buf[:hdrLen]) - payload.MarshalBytes(buf[hdrLen:]) - - return &Request{ - id: hdr.Unique, - hdr: &hdr, - data: buf, - }, nil -} - -// futureResponse represents an in-flight request, that may or may not have -// completed yet. Convert it to a resolved Response by calling Resolve, but note -// that this may block. -// -// +stateify savable -type futureResponse struct { - opcode linux.FUSEOpcode - ch chan struct{} - hdr *linux.FUSEHeaderOut - data []byte - - // If this request is async. - async bool -} - -// newFutureResponse creates a future response to a FUSE request. -func newFutureResponse(req *Request) *futureResponse { - return &futureResponse{ - opcode: req.hdr.Opcode, - ch: make(chan struct{}), - async: req.async, - } -} - -// resolve blocks the task until the server responds to its corresponding request, -// then returns a resolved response. -func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { - // Return directly for async requests. - if f.async { - return nil, nil - } - - if err := t.Block(f.ch); err != nil { - return nil, err - } - - return f.getResponse(), nil -} - -// getResponse creates a Response from the data the futureResponse has. -func (f *futureResponse) getResponse() *Response { - return &Response{ - opcode: f.opcode, - hdr: *f.hdr, - data: f.data, - } -} - -// Response represents an actual response from the server, including the -// response payload. -// -// +stateify savable -type Response struct { - opcode linux.FUSEOpcode - hdr linux.FUSEHeaderOut - data []byte -} - -// Error returns the error of the FUSE call. -func (r *Response) Error() error { - errno := r.hdr.Error - if errno >= 0 { - return nil - } - - sysErrNo := syscall.Errno(-errno) - return error(sysErrNo) -} - -// DataLen returns the size of the response without the header. -func (r *Response) DataLen() uint32 { - return r.hdr.Len - uint32(r.hdr.SizeBytes()) -} - -// UnmarshalPayload unmarshals the response data into m. -func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { - hdrLen := r.hdr.SizeBytes() - haveDataLen := r.hdr.Len - uint32(hdrLen) - wantDataLen := uint32(m.SizeBytes()) - - if haveDataLen < wantDataLen { - return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) - } - - // The response data is empty unless there is some payload. And so, doesn't - // need to be unmarshalled. - if r.data == nil { - return nil - } - - // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. - m.UnmarshalBytes(r.data[hdrLen:]) - return nil -} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index fa4e19113..0e21c31a4 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -195,11 +195,7 @@ const ( // and consistent with Linux's semantics (in particular, it is not always // possible for clients to set arbitrary atimes and mtimes depending on the // remote filesystem implementation, and never possible for clients to set - // arbitrary ctimes.) If a dentry containing a client-defined atime or - // mtime is evicted from cache, client timestamps will be sent to the - // remote filesystem on a best-effort basis to attempt to ensure that - // timestamps will be preserved when another dentry representing the same - // file is instantiated. + // arbitrary ctimes.) InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of @@ -1315,32 +1311,15 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.handleMu.Unlock() if !d.file.isNil() { - valid := p9.SetAttrMask{} - attr := p9.SetAttr{} - if !d.isDeleted() { - // Write dirty timestamps back to the remote filesystem. - if atomic.LoadUint32(&d.atimeDirty) != 0 { - valid.ATime = true - valid.ATimeNotSystemTime = true - atime := atomic.LoadInt64(&d.atime) - attr.ATimeSeconds = uint64(atime / 1e9) - attr.ATimeNanoSeconds = uint64(atime % 1e9) - } - if atomic.LoadUint32(&d.mtimeDirty) != 0 { - valid.MTime = true - valid.MTimeNotSystemTime = true - mtime := atomic.LoadInt64(&d.mtime) - attr.MTimeSeconds = uint64(mtime / 1e9) - attr.MTimeNanoSeconds = uint64(mtime % 1e9) - } - } - - // Check if attributes need to be changed before closing the file. - if valid.ATime || valid.MTime { - if err := d.file.setAttrClose(ctx, valid, attr); err != nil { - log.Warningf("gofer.dentry.destroyLocked: failed to close file with write dirty timestamps: %v", err) - } - } else if err := d.file.close(ctx); err != nil { + // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, + // i.e. client and server timestamps may differ (because e.g. a client + // write was serviced by the page cache, and only written back to the + // remote file later). Ideally, we'd write client timestamps back to + // the remote filesystem so that timestamps for a new dentry + // instantiated for the same file would remain coherent. Unfortunately, + // this turns out to be too expensive in many cases, so for now we + // don't do this. + if err := d.file.close(ctx); err != nil { log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) } d.file = p9file{} diff --git a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go index babb3f664..225f59782 100644 --- a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go +++ b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go @@ -2,11 +2,10 @@ package host import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/host/inode_refs.go b/pkg/sentry/fsimpl/host/inode_refs.go index 17f90ce4a..4075eae17 100644 --- a/pkg/sentry/fsimpl/host/inode_refs.go +++ b/pkg/sentry/fsimpl/host/inode_refs.go @@ -2,11 +2,10 @@ package host import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go index 35ded24bc..c0bf45f08 100644 --- a/pkg/sentry/fsimpl/host/socket_unsafe.go +++ b/pkg/sentry/fsimpl/host/socket_unsafe.go @@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, err + return length, n, msg.Controllen, controlTrunc, nil } - return n, n, msg.Controllen, controlTrunc, err + return n, n, msg.Controllen, controlTrunc, nil } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/fsimpl/kernfs/dentry_refs.go b/pkg/sentry/fsimpl/kernfs/dentry_refs.go index 79863b3bc..f99d4941a 100644 --- a/pkg/sentry/fsimpl/kernfs/dentry_refs.go +++ b/pkg/sentry/fsimpl/kernfs/dentry_refs.go @@ -2,11 +2,10 @@ package kernfs import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 49f6a0f1d..d7d3e8f48 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -140,7 +140,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir } // Reference on childVFSD dropped by a corresponding Valid. child = childVFSD.Impl().(*Dentry) - parent.InsertChildLocked(name, child) + parent.insertChildLocked(name, child) } return child, nil } @@ -548,7 +548,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st if !d.Impl().(*Dentry).isSymlink() { return "", syserror.EINVAL } - return inode.Readlink(ctx, rp.Mount()) + return inode.Readlink(ctx) } // RenameAt implements vfs.FilesystemImpl.RenameAt. @@ -657,10 +657,6 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - - // Store the name before walkExistingLocked as rp will be advanced past the - // name in the following call. - name := rp.Component() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { @@ -690,8 +686,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - - if err := parentDentry.inode.RmDir(ctx, name, vfsd); err != nil { + if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } @@ -770,10 +765,6 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - - // Store the name before walkExistingLocked as rp will be advanced past the - // name in the following call. - name := rp.Component() vfsd, _, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { @@ -799,7 +790,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, name, vfsd); err != nil { + if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index ea4f679c2..c0b863ba4 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -172,7 +172,7 @@ func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { +func (InodeNotSymlink) Readlink(context.Context) (string, error) { return "", syserror.EINVAL } @@ -256,13 +256,6 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li // SetStat implements Inode.SetStat. func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - return a.SetInodeStat(ctx, fs, creds, opts) -} - -// SetInodeStat sets the corresponding attributes from opts to InodeAttrs. -// This function can be used by other kernfs-based filesystem implementation to -// sets the unexported attributes into kernfs.InodeAttrs. -func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 163f26ceb..88fcd54aa 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -60,7 +60,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory @@ -247,15 +246,15 @@ func (d *Dentry) OnZeroWatches(context.Context) {} // Precondition: d must represent a directory inode. func (d *Dentry) InsertChild(name string, child *Dentry) { d.dirMu.Lock() - d.InsertChildLocked(name, child) + d.insertChildLocked(name, child) d.dirMu.Unlock() } -// InsertChildLocked is equivalent to InsertChild, with additional +// insertChildLocked is equivalent to InsertChild, with additional // preconditions. // // Precondition: d.dirMu must be locked. -func (d *Dentry) InsertChildLocked(name string, child *Dentry) { +func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) } @@ -268,36 +267,6 @@ func (d *Dentry) InsertChildLocked(name string, child *Dentry) { d.children[name] = child } -// RemoveChild removes child from the vfs dentry cache. This does not update the -// directory inode or modify the inode to be unlinked. So calling this on its own -// isn't sufficient to remove a child from a directory. -// -// Precondition: d must represent a directory inode. -func (d *Dentry) RemoveChild(name string, child *vfs.Dentry) error { - d.dirMu.Lock() - defer d.dirMu.Unlock() - return d.RemoveChildLocked(name, child) -} - -// RemoveChildLocked is equivalent to RemoveChild, with additional -// preconditions. -// -// Precondition: d.dirMu must be locked. -func (d *Dentry) RemoveChildLocked(name string, child *vfs.Dentry) error { - if !d.isDir() { - panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d)) - } - c, ok := d.children[name] - if !ok { - return syserror.ENOENT - } - if &c.vfsd != child { - panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child)) - } - delete(d.children, name) - return nil -} - // Inode returns the dentry's inode. func (d *Dentry) Inode() Inode { return d.inode @@ -456,7 +425,7 @@ type inodeDynamicLookup interface { Valid(ctx context.Context) bool // IterDirents is used to iterate over dynamically created entries. It invokes - // cb on each entry in the directory represented by the Inode. + // cb on each entry in the directory represented by the FileDescription. // 'offset' is the offset for the entire IterDirents call, which may include // results from the caller (e.g. "." and ".."). 'relOffset' is the offset // inside the entries returned by this IterDirents invocation. In other words, @@ -468,7 +437,7 @@ type inodeDynamicLookup interface { type inodeSymlink interface { // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. - Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) + Readlink(ctx context.Context) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: diff --git a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go index 478b04bdd..2b258010e 100644 --- a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go +++ b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go @@ -2,11 +2,10 @@ package kernfs import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index a9812fcef..64731a3e4 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -52,7 +52,7 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor } // Readlink implements Inode. -func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { +func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index c589b4746..360b77ef6 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -81,6 +82,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Start: d.parent.upperVD, Path: fspath.Parse(d.name), } + // Used during copy-up of memory-mapped regular files. + var mmapOpts *memmap.MMapOpts cleanupUndoCopyUp := func() { var err error if ftype == linux.S_IFDIR { @@ -136,6 +139,25 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { break } } + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if d.wrappedMappable != nil { + // We may have memory mappings of the file on the lower layer. + // Switch to mapping the file on the upper layer instead. + mmapOpts = &memmap.MMapOpts{ + Perms: usermem.ReadWrite, + MaxPerms: usermem.ReadWrite, + } + if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil { + cleanupUndoCopyUp() + return err + } + if mmapOpts.MappingIdentity != nil { + mmapOpts.MappingIdentity.DecRef(ctx) + } + // Don't actually switch Mappables until the end of copy-up; see + // below for why. + } if err := newFD.SetStat(ctx, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID, @@ -265,6 +287,62 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { atomic.StoreUint64(&d.ino, upperStat.Ino) } + if mmapOpts != nil && mmapOpts.Mappable != nil { + // Note that if mmapOpts != nil, then d.mapsMu is locked for writing + // (from the S_IFREG path above). + + // Propagate mappings of d to the new Mappable. Remember which mappings + // we added so we can remove them on failure. + upperMappable := mmapOpts.Mappable + allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) + for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + added := make(memmap.MappingsOfRange) + for m := range seg.Value() { + if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { + for m := range added { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + for mr, mappings := range allAdded { + for m := range mappings { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) + } + } + return err + } + added[m] = struct{}{} + } + allAdded[seg.Range()] = added + } + + // Switch to the new Mappable. We do this at the end of copy-up + // because: + // + // - We need to switch Mappables (by changing d.wrappedMappable) before + // invalidating Translations from the old Mappable (to pick up + // Translations from the new one). + // + // - We need to lock d.dataMu while changing d.wrappedMappable, but + // must invalidate Translations with d.dataMu unlocked (due to lock + // ordering). + // + // - Consequently, once we unlock d.dataMu, other threads may + // immediately observe the new (copied-up) Mappable, which we want to + // delay until copy-up is guaranteed to succeed. + d.dataMu.Lock() + lowerMappable := d.wrappedMappable + d.wrappedMappable = upperMappable + d.dataMu.Unlock() + d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Remove mappings from the old Mappable. + for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for m := range seg.Value() { + lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + } + d.lowerMappings.RemoveAll() + } + atomic.StoreUint32(&d.copiedUp, 1) return nil } diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index 268b32537..74cfd3799 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -256,10 +257,105 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error { // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - wrappedFD, err := fd.getCurrentFD(ctx) + if err := fd.ensureMappable(ctx, opts); err != nil { + return err + } + return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts) +} + +// ensureMappable ensures that fd.dentry().wrappedMappable is not nil. +func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + + // Fast path if we already have a Mappable for the current top layer. + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + + // Only permit mmap of regular files, since other file types may have + // unpredictable behavior when mmapped (e.g. /dev/zero). + if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG { + return syserror.ENODEV + } + + // Get a Mappable for the current top layer. + fd.mu.Lock() + defer fd.mu.Unlock() + d.copyMu.RLock() + defer d.copyMu.RUnlock() + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return err } - defer wrappedFD.DecRef(ctx) - return wrappedFD.ConfigureMMap(ctx, opts) + if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil { + return err + } + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef(ctx) + opts.MappingIdentity = nil + } + // Use this Mappable for all mappings of this layer (unless we raced with + // another call to ensureMappable). + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if d.wrappedMappable == nil { + d.wrappedMappable = opts.Mappable + atomic.StoreUint32(&d.isMappable, 1) + } + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, ar, offset, writable) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable) + if !d.isCopiedUp() { + d.lowerMappings.RemoveMapping(ms, ar, offset, writable) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, dstAR, offset, writable) + } + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.dataMu.RLock() + defer d.dataMu.RUnlock() + return d.wrappedMappable.Translate(ctx, required, optional, at) +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + return d.wrappedMappable.InvalidateUnsavable(ctx) } diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 9a8f7010e..b2efe5f80 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -22,6 +22,10 @@ // filesystem.renameMu // dentry.dirMu // dentry.copyMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.dataMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to @@ -37,6 +41,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -419,6 +424,35 @@ type dentry struct { devMinor uint32 ino uint64 + // If this dentry represents a regular file, then: + // + // - mapsMu is used to synchronize between copy-up and memmap.Mappable + // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. + // + // - dataMu is used to synchronize between copy-up and + // dentry.(memmap.Mappable).Translate. + // + // - lowerMappings tracks memory mappings of the file. lowerMappings is + // used to invalidate mappings of the lower layer when the file is copied + // up to ensure that they remain coherent with subsequent writes to the + // file. (Note that, as of this writing, Linux overlayfs does not do this; + // this feature is a gVisor extension.) lowerMappings is protected by + // mapsMu. + // + // - If this dentry is copied-up, then wrappedMappable is the Mappable + // obtained from a call to the current top layer's + // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil + // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil. + // wrappedMappable is protected by mapsMu and dataMu. + // + // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is + // accessed using atomic memory operations. + mapsMu sync.Mutex + lowerMappings memmap.MappingSet + dataMu sync.RWMutex + wrappedMappable memmap.Mappable + isMappable uint32 + locks vfs.FileLocks } diff --git a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go index 9431c1506..467c32752 100644 --- a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go @@ -2,11 +2,10 @@ package proc import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go index 872b20eb0..3fcda0948 100644 --- a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go @@ -2,11 +2,10 @@ package proc import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go index c6d9b3522..2da6801c2 100644 --- a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go @@ -2,11 +2,10 @@ package proc import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 5374538c9..3f0d78461 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -209,7 +209,7 @@ func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *ker return d } -func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { +func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return "", syserror.ENOENT diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 4f7f9cb00..356036b9b 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -668,7 +668,7 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr } // Readlink implements kernfs.Inode. -func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { +func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { return "", syserror.EACCES } @@ -808,11 +808,11 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri } // Readlink implements Inode. -func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { +func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { if err := checkTaskState(s.task); err != nil { return "", err } - return s.StaticSymlink.Readlink(ctx, mnt) + return s.StaticSymlink.Readlink(ctx) } // Getlink implements Inode.Getlink. diff --git a/pkg/sentry/fsimpl/proc/task_inode_refs.go b/pkg/sentry/fsimpl/proc/task_inode_refs.go index 714488450..b6e19844c 100644 --- a/pkg/sentry/fsimpl/proc/task_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/task_inode_refs.go @@ -2,11 +2,10 @@ package proc import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 68c541046..8c41729e4 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -51,7 +51,7 @@ func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns return d } -func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { +func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -64,8 +64,8 @@ func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error return strconv.FormatUint(uint64(tgid), 10), nil } -func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx, mnt) +func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) return vfs.VirtualDentry{}, target, err } @@ -94,7 +94,7 @@ func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, return d } -func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { +func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -108,8 +108,8 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, return fmt.Sprintf("%d/task/%d", tgid, tid), nil } -func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx, mnt) +func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) return vfs.VirtualDentry{}, target, err } diff --git a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go index 22d9cc488..6207364e4 100644 --- a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go @@ -2,11 +2,10 @@ package proc import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/sys/dir_refs.go b/pkg/sentry/fsimpl/sys/dir_refs.go index 89609b198..9d15d4c80 100644 --- a/pkg/sentry/fsimpl/sys/dir_refs.go +++ b/pkg/sentry/fsimpl/sys/dir_refs.go @@ -2,11 +2,10 @@ package sys import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/tmpfs/inode_refs.go b/pkg/sentry/fsimpl/tmpfs/inode_refs.go index dbf0b2766..ff5e99c52 100644 --- a/pkg/sentry/fsimpl/tmpfs/inode_refs.go +++ b/pkg/sentry/fsimpl/tmpfs/inode_refs.go @@ -2,11 +2,10 @@ package tmpfs import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/fd_table_refs.go b/pkg/sentry/kernel/fd_table_refs.go index ecba138ac..a630289c9 100644 --- a/pkg/sentry/kernel/fd_table_refs.go +++ b/pkg/sentry/kernel/fd_table_refs.go @@ -2,11 +2,10 @@ package kernel import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/fs_context_refs.go b/pkg/sentry/kernel/fs_context_refs.go index fb2fde971..e8bb1e6ee 100644 --- a/pkg/sentry/kernel/fs_context_refs.go +++ b/pkg/sentry/kernel/fs_context_refs.go @@ -2,11 +2,10 @@ package kernel import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/process_group_refs.go b/pkg/sentry/kernel/process_group_refs.go index 4ed6e6458..4b257d548 100644 --- a/pkg/sentry/kernel/process_group_refs.go +++ b/pkg/sentry/kernel/process_group_refs.go @@ -2,11 +2,10 @@ package kernel import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go index 8a2418c41..a0f2fe45c 100644 --- a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go +++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go @@ -1,12 +1,12 @@ package kernel import ( - "strings" "unsafe" "fmt" "gvisor.dev/gvisor/pkg/sync" "reflect" + "strings" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race diff --git a/pkg/sentry/kernel/session_refs.go b/pkg/sentry/kernel/session_refs.go index f2e1bb797..204fdd060 100644 --- a/pkg/sentry/kernel/session_refs.go +++ b/pkg/sentry/kernel/session_refs.go @@ -2,11 +2,10 @@ package kernel import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/shm/shm_refs.go b/pkg/sentry/kernel/shm/shm_refs.go index 51e07d0b3..4bffdd0b3 100644 --- a/pkg/sentry/kernel/shm/shm_refs.go +++ b/pkg/sentry/kernel/shm/shm_refs.go @@ -2,11 +2,10 @@ package shm import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/mm/aio_mappable_refs.go b/pkg/sentry/mm/aio_mappable_refs.go index b99909f07..141747137 100644 --- a/pkg/sentry/mm/aio_mappable_refs.go +++ b/pkg/sentry/mm/aio_mappable_refs.go @@ -2,11 +2,10 @@ package mm import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/mm/special_mappable_refs.go b/pkg/sentry/mm/special_mappable_refs.go index 035bbe690..0921a5d18 100644 --- a/pkg/sentry/mm/special_mappable_refs.go +++ b/pkg/sentry/mm/special_mappable_refs.go @@ -2,11 +2,10 @@ package mm import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/platform/ring0/defs_impl_arm64.go b/pkg/sentry/platform/ring0/defs_impl_arm64.go index 424b66f76..eba2eac30 100644 --- a/pkg/sentry/platform/ring0/defs_impl_arm64.go +++ b/pkg/sentry/platform/ring0/defs_impl_arm64.go @@ -3,11 +3,11 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "io" + "reflect" "fmt" "gvisor.dev/gvisor/pkg/usermem" - "io" - "reflect" ) // Useful bits. diff --git a/pkg/sentry/platform/ring0/entry_impl_arm64.s b/pkg/sentry/platform/ring0/entry_impl_arm64.s index f42800ebf..5bf870815 100644 --- a/pkg/sentry/platform/ring0/entry_impl_arm64.s +++ b/pkg/sentry/platform/ring0/entry_impl_arm64.s @@ -91,7 +91,9 @@ // ERET returns using the ELR and SPSR for the current exception level. #define ERET() \ - WORD $0xd69f03e0 + WORD $0xd69f03e0; \ + DSB $7; \ + ISB $15; // RSV_REG is a register that holds el1 information temporarily. #define RSV_REG R18_PLATFORM diff --git a/pkg/sentry/socket/unix/socket_refs.go b/pkg/sentry/socket/unix/socket_refs.go index dababb85f..39aaedc7f 100644 --- a/pkg/sentry/socket/unix/socket_refs.go +++ b/pkg/sentry/socket/unix/socket_refs.go @@ -2,11 +2,10 @@ package unix import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/socket/unix/transport/queue_refs.go b/pkg/sentry/socket/unix/transport/queue_refs.go index 0d4e34988..4c3dcd13f 100644 --- a/pkg/sentry/socket/unix/transport/queue_refs.go +++ b/pkg/sentry/socket/unix/transport/queue_refs.go @@ -2,11 +2,10 @@ package transport import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/time/seqatomic_parameters_unsafe.go b/pkg/sentry/time/seqatomic_parameters_unsafe.go index 14978ed91..f18440378 100644 --- a/pkg/sentry/time/seqatomic_parameters_unsafe.go +++ b/pkg/sentry/time/seqatomic_parameters_unsafe.go @@ -1,12 +1,12 @@ package time import ( - "strings" "unsafe" "fmt" "gvisor.dev/gvisor/pkg/sync" "reflect" + "strings" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 2b668fd89..68b80a951 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -107,7 +107,7 @@ func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSeque // file_operations::iterate == file_operations::iterate_shared == NULL in // Linux. func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { - return syserror.ENOSYS + return syserror.ENOTDIR } // Seek implements FileDescriptionImpl.Seek analogously to diff --git a/pkg/sentry/vfs/file_description_refs.go b/pkg/sentry/vfs/file_description_refs.go index bdd7e6554..6c7747259 100644 --- a/pkg/sentry/vfs/file_description_refs.go +++ b/pkg/sentry/vfs/file_description_refs.go @@ -2,11 +2,10 @@ package vfs import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/vfs/filesystem_refs.go b/pkg/sentry/vfs/filesystem_refs.go index 38a9a986f..96f681831 100644 --- a/pkg/sentry/vfs/filesystem_refs.go +++ b/pkg/sentry/vfs/filesystem_refs.go @@ -2,11 +2,10 @@ package vfs import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 06ca91989..9da09d4c1 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -18,14 +18,12 @@ import ( "bytes" "fmt" "math" - "path" "sort" "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) @@ -740,11 +738,23 @@ func (mntns *MountNamespace) Root() VirtualDentry { // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -755,7 +765,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -789,11 +799,25 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or + // vfs.StatAt() (=> FilesystemImpl.StatAt()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + + creds := auth.CredentialsFromContext(ctx) for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -804,7 +828,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -817,9 +841,10 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo Root: mntRootVD, Start: mntRootVD, } - statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) + statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) if err != nil { // Well that's not good. Ignore this mount. + ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) break } @@ -831,6 +856,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo fmt.Fprintf(buf, "%d ", mnt.ID) // (2) Parent ID (or this ID if there is no parent). + // Note that even if the call to mnt.parent() races with Mount + // destruction (which is possible since we're not holding vfs.mountMu), + // its Mount.ID will still be valid. pID := mnt.ID if p := mnt.parent(); p != nil { pID = p.ID @@ -879,30 +907,6 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo } } -// MakeSyntheticMountpoint creates parent directories of target if they do not -// exist and attempts to create a directory for the mountpoint. If a -// non-directory file already exists there then we allow it. -func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { - mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} - - // Make sure the parent directory of target exists. - if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil { - return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) - } - - // Attempt to mkdir the final component. If a file (of any type) exists - // then we let allow mounting on top of that because we do not require the - // target to be an existing directory, unlike Linux mount(2). - if err := vfs.MkdirAt(ctx, creds, &PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(target), - }, mkdirOpts); err != nil && err != syserror.EEXIST { - return fmt.Errorf("failed to create mountpoint %q: %w", target, err) - } - return nil -} - // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. // See Linux fs/seq_file.c:mangle_path. func manglePath(p string) string { diff --git a/pkg/sentry/vfs/mount_namespace_refs.go b/pkg/sentry/vfs/mount_namespace_refs.go index 63285fb8e..4c422c81f 100644 --- a/pkg/sentry/vfs/mount_namespace_refs.go +++ b/pkg/sentry/vfs/mount_namespace_refs.go @@ -2,11 +2,10 @@ package vfs import ( "fmt" - "runtime" - "sync/atomic" - "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "runtime" + "sync/atomic" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index ed1cf99ba..1ebf355ef 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -819,6 +819,30 @@ func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string return nil } +// MakeSyntheticMountpoint creates parent directories of target if they do not +// exist and attempts to create a directory for the mountpoint. If a +// non-directory file already exists there then we allow it. +func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { + mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + + // Make sure the parent directory of target exists. + if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil { + return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) + } + + // Attempt to mkdir the final component. If a file (of any type) exists + // then we let allow mounting on top of that because we do not require the + // target to be an existing directory, unlike Linux mount(2). + if err := vfs.MkdirAt(ctx, creds, &PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(target), + }, mkdirOpts); err != nil && err != syserror.EEXIST { + return fmt.Errorf("failed to create mountpoint %q: %w", target, err) + } + return nil +} + // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). |