summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fsimpl
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fsimpl')
-rw-r--r--pkg/sentry/fsimpl/devpts/master.go2
-rw-r--r--pkg/sentry/fsimpl/devpts/queue.go2
-rw-r--r--pkg/sentry/fsimpl/devpts/replica.go2
-rw-r--r--pkg/sentry/fsimpl/devpts/root_inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/devpts/terminal.go2
-rw-r--r--pkg/sentry/fsimpl/fuse/connection.go348
-rw-r--r--pkg/sentry/fsimpl/fuse/dev.go185
-rw-r--r--pkg/sentry/fsimpl/fuse/directory.go105
-rw-r--r--pkg/sentry/fsimpl/fuse/file.go133
-rw-r--r--pkg/sentry/fsimpl/fuse/fuse_state_autogen.go198
-rw-r--r--pkg/sentry/fsimpl/fuse/fusefs.go569
-rw-r--r--pkg/sentry/fsimpl/fuse/init.go (renamed from pkg/sentry/fsimpl/fuse/connection_control.go)169
-rw-r--r--pkg/sentry/fsimpl/fuse/inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/fuse/read_write.go242
-rw-r--r--pkg/sentry/fsimpl/fuse/regular_file.go230
-rw-r--r--pkg/sentry/fsimpl/fuse/request_response.go229
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go41
-rw-r--r--pkg/sentry/fsimpl/host/connected_endpoint_refs.go2
-rw-r--r--pkg/sentry/fsimpl/host/inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/host/socket_unsafe.go4
-rw-r--r--pkg/sentry/fsimpl/host/tty.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/dentry_refs.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go17
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go9
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go41
-rw-r--r--pkg/sentry/fsimpl/kernfs/static_directory_refs.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/symlink.go2
-rw-r--r--pkg/sentry/fsimpl/overlay/copy_up.go78
-rw-r--r--pkg/sentry/fsimpl/overlay/non_directory.go102
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go34
-rw-r--r--pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks_inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go2
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go6
-rw-r--r--pkg/sentry/fsimpl/proc/task_inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_files.go12
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_inode_refs.go2
-rw-r--r--pkg/sentry/fsimpl/sys/dir_refs.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/inode_refs.go2
40 files changed, 750 insertions, 2042 deletions
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index d07e1ded8..83d790b38 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -17,6 +17,7 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
@@ -27,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// masterInode is the inode for the master end of the Terminal.
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index ca36b66e9..55bff3e60 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -17,6 +17,7 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -24,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// waitBufMaxBytes is the maximum size of a wait buffer. It is based on
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
index 1f99f4b4d..58f6c1d3a 100644
--- a/pkg/sentry/fsimpl/devpts/replica.go
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -17,6 +17,7 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
@@ -26,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// replicaInode is the inode for the replica end of the Terminal.
diff --git a/pkg/sentry/fsimpl/devpts/root_inode_refs.go b/pkg/sentry/fsimpl/devpts/root_inode_refs.go
index 068ee2f20..051801202 100644
--- a/pkg/sentry/fsimpl/devpts/root_inode_refs.go
+++ b/pkg/sentry/fsimpl/devpts/root_inode_refs.go
@@ -1,10 +1,10 @@
package devpts
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index 731955d62..510bd6d89 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -17,9 +17,9 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// Terminal is a pseudoterminal.
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
index a7402c149..eb1d1a2b7 100644
--- a/pkg/sentry/fsimpl/fuse/connection.go
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -15,17 +15,31 @@
package fuse
import (
+ "errors"
+ "fmt"
"sync"
+ "sync/atomic"
+ "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
+// Ordinary requests have even IDs, while interrupts IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
const (
// fuseDefaultMaxBackground is the default value for MaxBackground.
fuseDefaultMaxBackground = 12
@@ -38,36 +52,43 @@ const (
fuseDefaultMaxPagesPerReq = 32
)
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+ requestEntry
+
+ id linux.FUSEOpID
+ hdr *linux.FUSEHeaderIn
+ data []byte
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+ opcode linux.FUSEOpcode
+ hdr linux.FUSEHeaderOut
+ data []byte
+}
+
// connection is the struct by which the sentry communicates with the FUSE server daemon.
-// Lock order:
-// - conn.fd.mu
-// - conn.mu
-// - conn.asyncMu
type connection struct {
fd *DeviceFD
- // mu protects access to struct memebers.
- mu sync.Mutex
-
- // attributeVersion is the version of connection's attributes.
- attributeVersion uint64
-
- // We target FUSE 7.23.
// The following FUSE_INIT flags are currently unsupported by this implementation:
+ // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
// - FUSE_EXPORT_SUPPORT
+ // - FUSE_HANDLE_KILLPRIV
// - FUSE_POSIX_LOCKS: requires POSIX locks
// - FUSE_FLOCK_LOCKS: requires POSIX locks
// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
+ // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
// - FUSE_ASYNC_DIO
- // - FUSE_PARALLEL_DIROPS (7.25)
- // - FUSE_HANDLE_KILLPRIV (7.26)
- // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26)
- // - FUSE_ABORT_ERROR (7.27)
- // - FUSE_CACHE_SYMLINKS (7.28)
- // - FUSE_NO_OPENDIR_SUPPORT (7.29)
- // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30)
- // - FUSE_MAP_ALIGNMENT (7.31)
+ // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
// initialized after receiving FUSE_INIT reply.
// Until it's set, suspend sending FUSE requests.
@@ -77,6 +98,10 @@ type connection struct {
// initializedChan is used to block requests before initialization.
initializedChan chan struct{}
+ // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground).
+ // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
+ blocked bool
+
// connected (connection established) when a new FUSE file system is created.
// Set to false when:
// umount,
@@ -84,55 +109,48 @@ type connection struct {
// device release.
connected bool
+ // aborted via sysfs.
+ // TODO(gvisor.dev/issue/3185): abort all queued requests.
+ aborted bool
+
// connInitError if FUSE_INIT encountered error (major version mismatch).
// Only set in INIT.
connInitError bool
// connInitSuccess if FUSE_INIT is successful.
// Only set in INIT.
- // Used for destory (not yet implemented).
+ // Used for destory.
connInitSuccess bool
- // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV).
- // Set only if abortErr is true and via fuse control fs (not yet implemented).
- // TODO(gvisor.dev/issue/3525): set this to true when user aborts.
- aborted bool
+ // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
- // numWating is the number of requests waiting to be
- // sent to FUSE device or being processed by FUSE daemon.
- numWaiting uint32
+ // NumberBackground is the number of requests in the background.
+ numBackground uint16
- // Terminology note:
- //
- // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct.
- //
- // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct.
- //
- // We call the "background" requests in unix term as async requests.
- // The "async requests" in unix term is our async requests that expect a reply,
- // i.e. `!requestOptions.noReply`
+ // congestionThreshold for NumBackground.
+ // Negotiated in FUSE_INIT.
+ congestionThreshold uint16
+
+ // maxBackground is the maximum number of NumBackground.
+ // Block connection when it is reached.
+ // Negotiated in FUSE_INIT.
+ maxBackground uint16
- // asyncMu protects the async request fields.
- asyncMu sync.Mutex
+ // numActiveBackground is the number of requests in background and has being marked as active.
+ numActiveBackground uint16
- // asyncNum is the number of async requests.
- // Protected by asyncMu.
- asyncNum uint16
+ // numWating is the number of requests waiting for completion.
+ numWaiting uint32
- // asyncCongestionThreshold the number of async requests.
- // Negotiated in FUSE_INIT as "CongestionThreshold".
- // TODO(gvisor.dev/issue/3529): add congestion control.
- // Protected by asyncMu.
- asyncCongestionThreshold uint16
+ // TODO(gvisor.dev/issue/3185): BgQueue
+ // some queue for background queued requests.
- // asyncNumMax is the maximum number of asyncNum.
- // Connection blocks the async requests when it is reached.
- // Negotiated in FUSE_INIT as "MaxBackground".
- // Protected by asyncMu.
- asyncNumMax uint16
+ // bgLock protects:
+ // MaxBackground, CongestionThreshold, NumBackground,
+ // NumActiveBackground, BgQueue, Blocked.
+ bgLock sync.Mutex
// maxRead is the maximum size of a read buffer in in bytes.
- // Initialized from a fuse fs parameter.
maxRead uint32
// maxWrite is the maximum size of a write buffer in bytes.
@@ -147,20 +165,23 @@ type connection struct {
// Negotiated and only set in INIT.
minor uint32
- // atomicOTrunc is true when FUSE does not send a separate SETATTR request
- // before open with O_TRUNC flag.
- // Negotiated and only set in INIT.
- atomicOTrunc bool
-
// asyncRead if read pages asynchronously.
// Negotiated and only set in INIT.
asyncRead bool
+ // abortErr is true if kernel need to return an unique read error after abort.
+ // Negotiated and only set in INIT.
+ abortErr bool
+
// writebackCache is true for write-back cache policy,
// false for write-through policy.
// Negotiated and only set in INIT.
writebackCache bool
+ // cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
+ // Negotiated and only set in INIT.
+ cacheSymlinks bool
+
// bigWrites if doing multi-page cached writes.
// Negotiated and only set in INIT.
bigWrites bool
@@ -168,70 +189,116 @@ type connection struct {
// dontMask if filestestem does not apply umask to creation modes.
// Negotiated in INIT.
dontMask bool
-
- // noOpen if FUSE server doesn't support open operation.
- // This flag only influence performance, not correctness of the program.
- noOpen bool
}
// newFUSEConnection creates a FUSE connection to fd.
-func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
// mount a FUSE filesystem.
fuseFD := fd.Impl().(*DeviceFD)
+ fuseFD.mounted = true
// Create the writeBuf for the header to be stored in.
hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
fuseFD.writeBuf = make([]byte, hdrLen)
fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
- fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests)
+ fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
fuseFD.writeCursor = 0
return &connection{
- fd: fuseFD,
- asyncNumMax: fuseDefaultMaxBackground,
- asyncCongestionThreshold: fuseDefaultCongestionThreshold,
- maxRead: opts.maxRead,
- maxPages: fuseDefaultMaxPagesPerReq,
- initializedChan: make(chan struct{}),
- connected: true,
+ fd: fuseFD,
+ maxBackground: fuseDefaultMaxBackground,
+ congestionThreshold: fuseDefaultCongestionThreshold,
+ maxPages: fuseDefaultMaxPagesPerReq,
+ initializedChan: make(chan struct{}),
+ connected: true,
}, nil
}
-// CallAsync makes an async (aka background) request.
-// It's a simple wrapper around Call().
-func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
- r.async = true
- _, err := conn.Call(t, r)
- return err
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+ // Unblock the requests sent before INIT.
+ close(conn.initializedChan)
+
+ // Close the channel first to avoid the non-atomic situation
+ // where conn.initialized is true but there are
+ // tasks being blocked on the channel.
+ // And it prevents the newer tasks from gaining
+ // unnecessary higher chance to be issued before the blocked one.
+
+ atomic.StoreInt32(&(conn.initialized), int32(1))
}
-// Call makes a request to the server.
-// Block before the connection is initialized.
-// When the Request is FUSE_INIT, it will not be blocked before initialization.
-// Task should never be nil.
-//
-// For a sync request, it blocks the invoking task until
-// a server responds with a response.
-//
-// For an async request (that do not expect a response immediately),
-// it returns directly unless being blocked either before initialization
-// or when there are too many async requests ongoing.
-//
-// Example for async request:
-// init, readahead, write, async read/write, fuse_notify_reply,
-// non-sync release, interrupt, forget.
-//
-// The forget request does not have a reply,
-// as documented in include/uapi/linux/fuse.h:FUSE_FORGET.
+// IsInitialized atomically check if the connection is initialized.
+// pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+ return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+ conn.fd.mu.Lock()
+ defer conn.fd.mu.Unlock()
+ conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+ hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+ hdr := linux.FUSEHeaderIn{
+ Len: uint32(hdrLen + payload.SizeBytes()),
+ Opcode: opcode,
+ Unique: conn.fd.nextOpID,
+ NodeID: ino,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ PID: pid,
+ }
+
+ buf := make([]byte, hdr.Len)
+ hdr.MarshalUnsafe(buf[:hdrLen])
+ payload.MarshalUnsafe(buf[hdrLen:])
+
+ return &Request{
+ id: hdr.Unique,
+ hdr: &hdr,
+ data: buf,
+ }, nil
+}
+
+// Call makes a request to the server and blocks the invoking task until a
+// server responds with a response. Task should never be nil.
+// Requests will not be sent before the connection is initialized.
+// For async tasks, use CallAsync().
func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
// Block requests sent before connection is initalized.
- if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
+ if !conn.Initialized() {
if err := t.Block(conn.initializedChan); err != nil {
return nil, err
}
}
+ return conn.call(t, r)
+}
+
+// CallAsync makes an async (aka background) request.
+// Those requests either do not expect a response (e.g. release) or
+// the response should be handled by others (e.g. init).
+// Return immediately unless the connection is blocked (before initialization).
+// Async call example: init, release, forget, aio, interrupt.
+// When the Request is FUSE_INIT, it will not be blocked before initialization.
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+ // Block requests sent before connection is initalized.
+ if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
+ if err := t.Block(conn.initializedChan); err != nil {
+ return err
+ }
+ }
+
+ // This should be the only place that invokes call() with a nil task.
+ _, err := conn.call(nil, r)
+ return err
+}
+
+// call makes a call without blocking checks.
+func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
if !conn.connected {
return nil, syserror.ENOTCONN
}
@@ -248,6 +315,31 @@ func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
return fut.resolve(t)
}
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+ errno := r.hdr.Error
+ if errno >= 0 {
+ return nil
+ }
+
+ sysErrNo := syscall.Errno(-errno)
+ return error(sysErrNo)
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+ hdrLen := r.hdr.SizeBytes()
+ haveDataLen := r.hdr.Len - uint32(hdrLen)
+ wantDataLen := uint32(m.SizeBytes())
+
+ if haveDataLen < wantDataLen {
+ return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
+ }
+
+ m.UnmarshalUnsafe(r.data[hdrLen:])
+ return nil
+}
+
// callFuture makes a request to the server and returns a future response.
// Call resolve() when the response needs to be fulfilled.
func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
@@ -266,6 +358,11 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
// if there are always too many ongoing requests all the time. The
// supported maxActiveRequests setting should be really high to avoid this.
for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+ if t == nil {
+ // Since there is no task that is waiting. We must error out.
+ return nil, errors.New("FUSE request queue full")
+ }
+
log.Infof("Blocking request %v from being queued. Too many active requests: %v",
r.id, conn.fd.numActiveRequests)
conn.fd.mu.Unlock()
@@ -281,19 +378,9 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
// callFutureLocked makes a request to the server and returns a future response.
func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
- // Check connected again holding conn.mu.
- conn.mu.Lock()
- if !conn.connected {
- conn.mu.Unlock()
- // we checked connected before,
- // this must be due to aborted connection.
- return nil, syserror.ECONNABORTED
- }
- conn.mu.Unlock()
-
conn.fd.queue.PushBack(r)
- conn.fd.numActiveRequests++
- fut := newFutureResponse(r)
+ conn.fd.numActiveRequests += 1
+ fut := newFutureResponse(r.hdr.Opcode)
conn.fd.completions[r.id] = fut
// Signal the readers that there is something to read.
@@ -301,3 +388,50 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes
return fut, nil
}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+ opcode linux.FUSEOpcode
+ ch chan struct{}
+ hdr *linux.FUSEHeaderOut
+ data []byte
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
+ return &futureResponse{
+ opcode: opcode,
+ ch: make(chan struct{}),
+ }
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+ // If there is no Task associated with this request - then we don't try to resolve
+ // the response. Instead, the task writing the response (proxy to the server) will
+ // process the response on our behalf.
+ if t == nil {
+ log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
+ return nil, nil
+ }
+
+ if err := t.Block(f.ch); err != nil {
+ return nil, err
+ }
+
+ return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+ return &Response{
+ opcode: f.opcode,
+ hdr: *f.hdr,
+ data: f.data,
+ }
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index 64c3e32e2..e522ff9a0 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -55,6 +56,9 @@ type DeviceFD struct {
vfs.DentryMetadataFileDescriptionImpl
vfs.NoLockFD
+ // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
+ mounted bool
+
// nextOpID is used to create new requests.
nextOpID linux.FUSEOpID
@@ -96,15 +100,13 @@ type DeviceFD struct {
// Release implements vfs.FileDescriptionImpl.Release.
func (fd *DeviceFD) Release(context.Context) {
- if fd.fs != nil {
- fd.fs.conn.connected = false
- }
+ fd.fs.conn.connected = false
}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if fd.fs == nil {
+ if !fd.mounted {
return 0, syserror.EPERM
}
@@ -114,16 +116,10 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in
// Read implements vfs.FileDescriptionImpl.Read.
func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if fd.fs == nil {
+ if !fd.mounted {
return 0, syserror.EPERM
}
- // Return ENODEV if the filesystem is umounted.
- if fd.fs.umounted {
- // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs.
- return 0, syserror.ENODEV
- }
-
// We require that any Read done on this filesystem have a sane minimum
// read buffer. It must have the capacity for the fixed parts of any request
// header (Linux uses the request header and the FUSEWriteIn header for this
@@ -147,82 +143,58 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R
}
// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
-//
-// Preconditions: dst is large enough for any reasonable request.
func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- var req *Request
+ if fd.queue.Empty() {
+ return 0, syserror.ErrWouldBlock
+ }
- // Find the first valid request.
- // For the normal case this loop only execute once.
- for !fd.queue.Empty() {
- req = fd.queue.Front()
+ var readCursor uint32
+ var bytesRead int64
+ for {
+ req := fd.queue.Front()
+ if dst.NumBytes() < int64(req.hdr.Len) {
+ // The request is too large. Cannot process it. All requests must be smaller than the
+ // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+ // handshake.
+ errno := -int32(syscall.EIO)
+ if req.hdr.Opcode == linux.FUSE_SETXATTR {
+ errno = -int32(syscall.E2BIG)
+ }
- if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
- break
- }
+ // Return the error to the calling task.
+ if err := fd.sendError(ctx, errno, req); err != nil {
+ return 0, err
+ }
- // The request is too large. Cannot process it. All requests must be smaller than the
- // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
- // handshake.
- errno := -int32(syscall.EIO)
- if req.hdr.Opcode == linux.FUSE_SETXATTR {
- errno = -int32(syscall.E2BIG)
- }
+ // We're done with this request.
+ fd.queue.Remove(req)
- // Return the error to the calling task.
- if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
- return 0, err
+ // Restart the read as this request was invalid.
+ log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
+ return fd.readLocked(ctx, dst, opts)
}
- // We're done with this request.
- fd.queue.Remove(req)
- req = nil
- }
-
- if req == nil {
- return 0, syserror.ErrWouldBlock
- }
-
- // We already checked the size: dst must be able to fit the whole request.
- // Now we write the marshalled header, the payload,
- // and the potential additional payload
- // to the user memory IOSequence.
-
- n, err := dst.CopyOut(ctx, req.data)
- if err != nil {
- return 0, err
- }
- if n != len(req.data) {
- return 0, syserror.EIO
- }
-
- if req.hdr.Opcode == linux.FUSE_WRITE {
- written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
+ n, err := dst.CopyOut(ctx, req.data[readCursor:])
if err != nil {
return 0, err
}
- if written != len(req.payload) {
- return 0, syserror.EIO
- }
- n += int(written)
- }
+ readCursor += uint32(n)
+ bytesRead += int64(n)
- // Fully done with this req, remove it from the queue.
- fd.queue.Remove(req)
-
- // Remove noReply ones from map of requests expecting a reply.
- if req.noReply {
- fd.numActiveRequests -= 1
- delete(fd.completions, req.hdr.Unique)
+ if readCursor >= req.hdr.Len {
+ // Fully done with this req, remove it from the queue.
+ fd.queue.Remove(req)
+ break
+ }
}
- return int64(n), nil
+ return bytesRead, nil
}
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if fd.fs == nil {
+ if !fd.mounted {
return 0, syserror.EPERM
}
@@ -239,15 +211,10 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.
// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if fd.fs == nil {
+ if !fd.mounted {
return 0, syserror.EPERM
}
- // Return ENODEV if the filesystem is umounted.
- if fd.fs.umounted {
- return 0, syserror.ENODEV
- }
-
var cn, n int64
hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
@@ -309,12 +276,7 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
fut, ok := fd.completions[hdr.Unique]
if !ok {
- if fut.hdr.Unique == linux.FUSE_RELEASE {
- // Currently we simply discard the reply for FUSE_RELEASE.
- return n + src.NumBytes(), nil
- }
- // Server sent us a response for a request we never sent,
- // or for which we already received a reply (e.g. aborted), an unlikely event.
+ // Server sent us a response for a request we never sent?
return 0, syserror.EINVAL
}
@@ -345,23 +307,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
// Readiness implements vfs.FileDescriptionImpl.Readiness.
func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
- fd.mu.Lock()
- defer fd.mu.Unlock()
- return fd.readinessLocked(mask)
-}
-
-// readinessLocked implements checking the readiness of the fuse device while
-// locked with DeviceFD.mu.
-func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
var ready waiter.EventMask
-
- if fd.fs.umounted {
- ready |= waiter.EventErr
- return ready & mask
- }
-
- // FD is always writable.
- ready |= waiter.EventOut
+ ready |= waiter.EventOut // FD is always writable
if !fd.queue.Empty() {
// Have reqs available, FD is readable.
ready |= waiter.EventIn
@@ -383,7 +330,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if fd.fs == nil {
+ if !fd.mounted {
return 0, syserror.EPERM
}
@@ -392,54 +339,58 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64
// sendResponse sends a response to the waiting task (if any).
func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
- // Signal the task waiting on a response if any.
- defer close(fut.ch)
+ // See if the running task need to perform some action before returning.
+ // Since we just finished writing the future, we can be sure that
+ // getResponse generates a populated response.
+ if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil {
+ return err
+ }
// Signal that the queue is no longer full.
select {
case fd.fullQueueCh <- struct{}{}:
default:
}
- fd.numActiveRequests--
-
- if fut.async {
- return fd.asyncCallBack(ctx, fut.getResponse())
- }
+ fd.numActiveRequests -= 1
+ // Signal the task waiting on a response.
+ close(fut.ch)
return nil
}
-// sendError sends an error response to the waiting task (if any) by calling sendResponse().
-func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
+// sendError sends an error response to the waiting task (if any).
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
// Return the error to the calling task.
outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
respHdr := linux.FUSEHeaderOut{
Len: outHdrLen,
Error: errno,
- Unique: unique,
+ Unique: req.hdr.Unique,
}
fut, ok := fd.completions[respHdr.Unique]
if !ok {
- // A response for a request we never sent,
- // or for which we already received a reply (e.g. aborted).
+ // Server sent us a response for a request we never sent?
return syserror.EINVAL
}
delete(fd.completions, respHdr.Unique)
fut.hdr = &respHdr
- return fd.sendResponse(ctx, fut)
+ if err := fd.sendResponse(ctx, fut); err != nil {
+ return err
+ }
+
+ return nil
}
-// asyncCallBack executes pre-defined callback function for async requests.
-// Currently used by: FUSE_INIT.
-func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
- switch r.opcode {
- case linux.FUSE_INIT:
+// noReceiverAction has the calling kernel.Task do some action if its known that no
+// receiver is going to be waiting on the future channel. This is to be used by:
+// FUSE_INIT.
+func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
+ if r.opcode == linux.FUSE_INIT {
creds := auth.CredentialsFromContext(ctx)
rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
- // TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
}
return nil
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
deleted file mode 100644
index 798c4a6f3..000000000
--- a/pkg/sentry/fsimpl/fuse/directory.go
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package fuse
-
-import (
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
-)
-
-type directoryFD struct {
- fileDescription
-}
-
-// Allocate implements directoryFD.Allocate.
-func (directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
- return syserror.EISDIR
-}
-
-// PRead implements FileDescriptionImpl.PRead.
-func (directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
-
-// Read implements FileDescriptionImpl.Read.
-func (directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
-
-// PWrite implements FileDescriptionImpl.PWrite.
-func (directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
-
-// Write implements FileDescriptionImpl.Write.
-func (directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
-
-// IterDirents implements FileDescriptionImpl.IterDirents.
-func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
- fusefs := dir.inode().fs
- task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
-
- in := linux.FUSEReadIn{
- Fh: dir.Fh,
- Offset: uint64(atomic.LoadInt64(&dir.off)),
- Size: linux.FUSE_PAGE_SIZE,
- Flags: dir.statusFlags(),
- }
-
- // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS.
- req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in)
- if err != nil {
- return err
- }
-
- res, err := fusefs.conn.Call(task, req)
- if err != nil {
- return err
- }
- if err := res.Error(); err != nil {
- return err
- }
-
- var out linux.FUSEDirents
- if err := res.UnmarshalPayload(&out); err != nil {
- return err
- }
-
- for _, fuseDirent := range out.Dirents {
- nextOff := int64(fuseDirent.Meta.Off)
- dirent := vfs.Dirent{
- Name: fuseDirent.Name,
- Type: uint8(fuseDirent.Meta.Type),
- Ino: fuseDirent.Meta.Ino,
- NextOff: nextOff,
- }
-
- if err := callback.Handle(dirent); err != nil {
- return err
- }
- atomic.StoreInt64(&dir.off, nextOff)
- }
-
- return nil
-}
diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go
deleted file mode 100644
index 991efcda4..000000000
--- a/pkg/sentry/fsimpl/fuse/file.go
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package fuse
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/usermem"
-)
-
-// fileDescription implements vfs.FileDescriptionImpl for fuse.
-type fileDescription struct {
- vfsfd vfs.FileDescription
- vfs.FileDescriptionDefaultImpl
- vfs.DentryMetadataFileDescriptionImpl
- vfs.NoLockFD
-
- // the file handle used in userspace.
- Fh uint64
-
- // Nonseekable is indicate cannot perform seek on a file.
- Nonseekable bool
-
- // DirectIO suggest fuse to use direct io operation.
- DirectIO bool
-
- // OpenFlag is the flag returned by open.
- OpenFlag uint32
-
- // off is the file offset.
- off int64
-}
-
-func (fd *fileDescription) dentry() *kernfs.Dentry {
- return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry)
-}
-
-func (fd *fileDescription) inode() *inode {
- return fd.dentry().Inode().(*inode)
-}
-
-func (fd *fileDescription) filesystem() *vfs.Filesystem {
- return fd.vfsfd.VirtualDentry().Mount().Filesystem()
-}
-
-func (fd *fileDescription) statusFlags() uint32 {
- return fd.vfsfd.StatusFlags()
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *fileDescription) Release(ctx context.Context) {
- // no need to release if FUSE server doesn't implement Open.
- conn := fd.inode().fs.conn
- if conn.noOpen {
- return
- }
-
- in := linux.FUSEReleaseIn{
- Fh: fd.Fh,
- Flags: fd.statusFlags(),
- }
- // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner.
- var opcode linux.FUSEOpcode
- if fd.inode().Mode().IsDir() {
- opcode = linux.FUSE_RELEASEDIR
- } else {
- opcode = linux.FUSE_RELEASE
- }
- kernelTask := kernel.TaskFromContext(ctx)
- // ignoring errors and FUSE server reply is analogous to Linux's behavior.
- req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in)
- if err != nil {
- // No way to invoke Call() with an errored request.
- return
- }
- req.noReply = true
- conn.CallAsync(kernelTask, req)
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
- return 0, nil
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- return 0, nil
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- return 0, nil
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
- return 0, nil
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
- return 0, nil
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
- fs := fd.filesystem()
- inode := fd.inode()
- return inode.Stat(ctx, fs, opts)
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
- fs := fd.filesystem()
- creds := auth.CredentialsFromContext(ctx)
- return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh)
-}
diff --git a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go
index 6c82dce30..f72fe342e 100644
--- a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go
+++ b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go
@@ -6,191 +6,179 @@ import (
"gvisor.dev/gvisor/pkg/state"
)
-func (x *inodeRefs) StateTypeName() string {
- return "pkg/sentry/fsimpl/fuse.inodeRefs"
+func (x *Request) StateTypeName() string {
+ return "pkg/sentry/fsimpl/fuse.Request"
}
-func (x *inodeRefs) StateFields() []string {
+func (x *Request) StateFields() []string {
return []string{
- "refCount",
+ "requestEntry",
+ "id",
+ "hdr",
+ "data",
}
}
-func (x *inodeRefs) beforeSave() {}
+func (x *Request) beforeSave() {}
-func (x *inodeRefs) StateSave(m state.Sink) {
+func (x *Request) StateSave(m state.Sink) {
x.beforeSave()
- m.Save(0, &x.refCount)
+ m.Save(0, &x.requestEntry)
+ m.Save(1, &x.id)
+ m.Save(2, &x.hdr)
+ m.Save(3, &x.data)
}
-func (x *inodeRefs) afterLoad() {}
+func (x *Request) afterLoad() {}
-func (x *inodeRefs) StateLoad(m state.Source) {
- m.Load(0, &x.refCount)
+func (x *Request) StateLoad(m state.Source) {
+ m.Load(0, &x.requestEntry)
+ m.Load(1, &x.id)
+ m.Load(2, &x.hdr)
+ m.Load(3, &x.data)
}
-func (x *requestList) StateTypeName() string {
- return "pkg/sentry/fsimpl/fuse.requestList"
+func (x *Response) StateTypeName() string {
+ return "pkg/sentry/fsimpl/fuse.Response"
}
-func (x *requestList) StateFields() []string {
+func (x *Response) StateFields() []string {
return []string{
- "head",
- "tail",
+ "opcode",
+ "hdr",
+ "data",
}
}
-func (x *requestList) beforeSave() {}
+func (x *Response) beforeSave() {}
-func (x *requestList) StateSave(m state.Sink) {
+func (x *Response) StateSave(m state.Sink) {
x.beforeSave()
- m.Save(0, &x.head)
- m.Save(1, &x.tail)
+ m.Save(0, &x.opcode)
+ m.Save(1, &x.hdr)
+ m.Save(2, &x.data)
}
-func (x *requestList) afterLoad() {}
+func (x *Response) afterLoad() {}
-func (x *requestList) StateLoad(m state.Source) {
- m.Load(0, &x.head)
- m.Load(1, &x.tail)
+func (x *Response) StateLoad(m state.Source) {
+ m.Load(0, &x.opcode)
+ m.Load(1, &x.hdr)
+ m.Load(2, &x.data)
}
-func (x *requestEntry) StateTypeName() string {
- return "pkg/sentry/fsimpl/fuse.requestEntry"
+func (x *futureResponse) StateTypeName() string {
+ return "pkg/sentry/fsimpl/fuse.futureResponse"
}
-func (x *requestEntry) StateFields() []string {
+func (x *futureResponse) StateFields() []string {
return []string{
- "next",
- "prev",
+ "opcode",
+ "ch",
+ "hdr",
+ "data",
}
}
-func (x *requestEntry) beforeSave() {}
+func (x *futureResponse) beforeSave() {}
-func (x *requestEntry) StateSave(m state.Sink) {
+func (x *futureResponse) StateSave(m state.Sink) {
x.beforeSave()
- m.Save(0, &x.next)
- m.Save(1, &x.prev)
+ m.Save(0, &x.opcode)
+ m.Save(1, &x.ch)
+ m.Save(2, &x.hdr)
+ m.Save(3, &x.data)
}
-func (x *requestEntry) afterLoad() {}
+func (x *futureResponse) afterLoad() {}
-func (x *requestEntry) StateLoad(m state.Source) {
- m.Load(0, &x.next)
- m.Load(1, &x.prev)
+func (x *futureResponse) StateLoad(m state.Source) {
+ m.Load(0, &x.opcode)
+ m.Load(1, &x.ch)
+ m.Load(2, &x.hdr)
+ m.Load(3, &x.data)
}
-func (x *Request) StateTypeName() string {
- return "pkg/sentry/fsimpl/fuse.Request"
+func (x *inodeRefs) StateTypeName() string {
+ return "pkg/sentry/fsimpl/fuse.inodeRefs"
}
-func (x *Request) StateFields() []string {
+func (x *inodeRefs) StateFields() []string {
return []string{
- "requestEntry",
- "id",
- "hdr",
- "data",
- "payload",
- "async",
- "noReply",
+ "refCount",
}
}
-func (x *Request) beforeSave() {}
+func (x *inodeRefs) beforeSave() {}
-func (x *Request) StateSave(m state.Sink) {
+func (x *inodeRefs) StateSave(m state.Sink) {
x.beforeSave()
- m.Save(0, &x.requestEntry)
- m.Save(1, &x.id)
- m.Save(2, &x.hdr)
- m.Save(3, &x.data)
- m.Save(4, &x.payload)
- m.Save(5, &x.async)
- m.Save(6, &x.noReply)
+ m.Save(0, &x.refCount)
}
-func (x *Request) afterLoad() {}
+func (x *inodeRefs) afterLoad() {}
-func (x *Request) StateLoad(m state.Source) {
- m.Load(0, &x.requestEntry)
- m.Load(1, &x.id)
- m.Load(2, &x.hdr)
- m.Load(3, &x.data)
- m.Load(4, &x.payload)
- m.Load(5, &x.async)
- m.Load(6, &x.noReply)
+func (x *inodeRefs) StateLoad(m state.Source) {
+ m.Load(0, &x.refCount)
}
-func (x *futureResponse) StateTypeName() string {
- return "pkg/sentry/fsimpl/fuse.futureResponse"
+func (x *requestList) StateTypeName() string {
+ return "pkg/sentry/fsimpl/fuse.requestList"
}
-func (x *futureResponse) StateFields() []string {
+func (x *requestList) StateFields() []string {
return []string{
- "opcode",
- "ch",
- "hdr",
- "data",
- "async",
+ "head",
+ "tail",
}
}
-func (x *futureResponse) beforeSave() {}
+func (x *requestList) beforeSave() {}
-func (x *futureResponse) StateSave(m state.Sink) {
+func (x *requestList) StateSave(m state.Sink) {
x.beforeSave()
- m.Save(0, &x.opcode)
- m.Save(1, &x.ch)
- m.Save(2, &x.hdr)
- m.Save(3, &x.data)
- m.Save(4, &x.async)
+ m.Save(0, &x.head)
+ m.Save(1, &x.tail)
}
-func (x *futureResponse) afterLoad() {}
+func (x *requestList) afterLoad() {}
-func (x *futureResponse) StateLoad(m state.Source) {
- m.Load(0, &x.opcode)
- m.Load(1, &x.ch)
- m.Load(2, &x.hdr)
- m.Load(3, &x.data)
- m.Load(4, &x.async)
+func (x *requestList) StateLoad(m state.Source) {
+ m.Load(0, &x.head)
+ m.Load(1, &x.tail)
}
-func (x *Response) StateTypeName() string {
- return "pkg/sentry/fsimpl/fuse.Response"
+func (x *requestEntry) StateTypeName() string {
+ return "pkg/sentry/fsimpl/fuse.requestEntry"
}
-func (x *Response) StateFields() []string {
+func (x *requestEntry) StateFields() []string {
return []string{
- "opcode",
- "hdr",
- "data",
+ "next",
+ "prev",
}
}
-func (x *Response) beforeSave() {}
+func (x *requestEntry) beforeSave() {}
-func (x *Response) StateSave(m state.Sink) {
+func (x *requestEntry) StateSave(m state.Sink) {
x.beforeSave()
- m.Save(0, &x.opcode)
- m.Save(1, &x.hdr)
- m.Save(2, &x.data)
+ m.Save(0, &x.next)
+ m.Save(1, &x.prev)
}
-func (x *Response) afterLoad() {}
+func (x *requestEntry) afterLoad() {}
-func (x *Response) StateLoad(m state.Source) {
- m.Load(0, &x.opcode)
- m.Load(1, &x.hdr)
- m.Load(2, &x.data)
+func (x *requestEntry) StateLoad(m state.Source) {
+ m.Load(0, &x.next)
+ m.Load(1, &x.prev)
}
func init() {
+ state.Register((*Request)(nil))
+ state.Register((*Response)(nil))
+ state.Register((*futureResponse)(nil))
state.Register((*inodeRefs)(nil))
state.Register((*requestList)(nil))
state.Register((*requestEntry)(nil))
- state.Register((*Request)(nil))
- state.Register((*futureResponse)(nil))
- state.Register((*Response)(nil))
}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 402dabe5a..810819ae4 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -16,10 +16,7 @@
package fuse
import (
- "math"
"strconv"
- "sync"
- "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -29,17 +26,11 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// Name is the default filesystem name.
const Name = "fuse"
-// maxActiveRequestsDefault is the default setting controlling the upper bound
-// on the number of active requests at any given time.
-const maxActiveRequestsDefault = 10000
-
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
@@ -65,11 +56,6 @@ type filesystemOptions struct {
// exist at any time. Any further requests will block when trying to
// Call the server.
maxActiveRequests uint64
-
- // maxRead is the max number of bytes to read,
- // specified as "max_read" in fs parameters.
- // If not specified by user, use math.MaxUint32 as default value.
- maxRead uint32
}
// filesystem implements vfs.FilesystemImpl.
@@ -83,9 +69,6 @@ type filesystem struct {
// opts is the options the fusefs is initialized with.
opts *filesystemOptions
-
- // umounted is true if filesystem.Release() has been called.
- umounted bool
}
// Name implements vfs.FilesystemType.Name.
@@ -159,29 +142,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// Set the maxInFlightRequests option.
fsopts.maxActiveRequests = maxActiveRequestsDefault
- if maxReadStr, ok := mopts["max_read"]; ok {
- delete(mopts, "max_read")
- maxRead, err := strconv.ParseUint(maxReadStr, 10, 32)
- if err != nil {
- log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr)
- return nil, nil, syserror.EINVAL
- }
- if maxRead < fuseMinMaxRead {
- maxRead = fuseMinMaxRead
- }
- fsopts.maxRead = uint32(maxRead)
- } else {
- fsopts.maxRead = math.MaxUint32
- }
-
// Check for unparsed options.
if len(mopts) != 0 {
- log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts)
+ log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
return nil, nil, syserror.EINVAL
}
// Create a new FUSE filesystem.
- fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+ fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
if err != nil {
log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
return nil, nil, err
@@ -197,27 +165,26 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
// root is the fusefs root directory.
- root := fs.newRootInode(creds, fsopts.rootMode)
+ root := fs.newInode(creds, fsopts.rootMode)
return fs.VFSFilesystem(), root.VFSDentry(), nil
}
-// newFUSEFilesystem creates a new FUSE filesystem.
-func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
- conn, err := newFUSEConnection(ctx, device, opts)
+// NewFUSEFilesystem creates a new FUSE filesystem.
+func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+ fs := &filesystem{
+ devMinor: devMinor,
+ opts: opts,
+ }
+
+ conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
if err != nil {
log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
return nil, syserror.EINVAL
}
+ fs.conn = conn
fuseFD := device.Impl().(*DeviceFD)
-
- fs := &filesystem{
- devMinor: devMinor,
- opts: opts,
- conn: conn,
- }
-
fuseFD.fs = fs
return fs, nil
@@ -225,14 +192,6 @@ func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
- fs.umounted = true
- fs.conn.Abort(ctx)
-
- fs.conn.fd.mu.Lock()
- // Notify all the waiters on this fd.
- fs.conn.fd.waitQueue.Notify(waiter.EventIn)
- fs.conn.fd.mu.Unlock()
-
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
fs.Filesystem.Release(ctx)
}
@@ -246,49 +205,14 @@ type inode struct {
kernfs.InodeNotSymlink
kernfs.OrderedChildren
- dentry kernfs.Dentry
-
- // the owning filesystem. fs is immutable.
- fs *filesystem
-
- // metaDataMu protects the metadata of this inode.
- metadataMu sync.Mutex
-
- nodeID uint64
-
locks vfs.FileLocks
- // size of the file.
- size uint64
-
- // attributeVersion is the version of inode's attributes.
- attributeVersion uint64
-
- // attributeTime is the remaining vaild time of attributes.
- attributeTime uint64
-
- // version of the inode.
- version uint64
-
- // link is result of following a symbolic link.
- link string
-}
-
-func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
- i := &inode{fs: fs}
- i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
- i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- i.dentry.Init(i)
- i.nodeID = 1
-
- return &i.dentry
+ dentry kernfs.Dentry
}
-func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry {
- i := &inode{fs: fs, nodeID: nodeID}
- creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)}
- i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
- atomic.StoreUint64(&i.size, attr.Size)
+func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+ i := &inode{}
+ i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
i.EnableLeakCheck()
i.dentry.Init(i)
@@ -298,301 +222,13 @@ func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentr
// Open implements kernfs.Inode.Open.
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- isDir := i.InodeAttrs.Mode().IsDir()
- // return error if specified to open directory but inode is not a directory.
- if !isDir && opts.Mode.IsDir() {
- return nil, syserror.ENOTDIR
- }
- if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS {
- return nil, syserror.EOVERFLOW
- }
-
- var fd *fileDescription
- var fdImpl vfs.FileDescriptionImpl
- if isDir {
- directoryFD := &directoryFD{}
- fd = &(directoryFD.fileDescription)
- fdImpl = directoryFD
- } else {
- regularFD := &regularFileFD{}
- fd = &(regularFD.fileDescription)
- fdImpl = regularFD
- }
- // FOPEN_KEEP_CACHE is the defualt flag for noOpen.
- fd.OpenFlag = linux.FOPEN_KEEP_CACHE
-
- // Only send open request when FUSE server support open or is opening a directory.
- if !i.fs.conn.noOpen || isDir {
- kernelTask := kernel.TaskFromContext(ctx)
- if kernelTask == nil {
- log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context")
- return nil, syserror.EINVAL
- }
-
- // Build the request.
- var opcode linux.FUSEOpcode
- if isDir {
- opcode = linux.FUSE_OPENDIR
- } else {
- opcode = linux.FUSE_OPEN
- }
-
- in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)}
- if !i.fs.conn.atomicOTrunc {
- in.Flags &= ^uint32(linux.O_TRUNC)
- }
-
- req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in)
- if err != nil {
- return nil, err
- }
-
- // Send the request and receive the reply.
- res, err := i.fs.conn.Call(kernelTask, req)
- if err != nil {
- return nil, err
- }
- if err := res.Error(); err == syserror.ENOSYS && !isDir {
- i.fs.conn.noOpen = true
- } else if err != nil {
- return nil, err
- } else {
- out := linux.FUSEOpenOut{}
- if err := res.UnmarshalPayload(&out); err != nil {
- return nil, err
- }
-
- // Process the reply.
- fd.OpenFlag = out.OpenFlag
- if isDir {
- fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO)
- }
-
- fd.Fh = out.Fh
- }
- }
-
- // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode
- fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0
- fdOptions := &vfs.FileDescriptionOptions{}
- if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 {
- fdOptions.DenyPRead = true
- fdOptions.DenyPWrite = true
- fd.Nonseekable = true
- }
-
- // If we don't send SETATTR before open (which is indicated by atomicOTrunc)
- // and O_TRUNC is set, update the inode's version number and clean existing data
- // by setting the file size to 0.
- if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 {
- i.fs.conn.mu.Lock()
- i.fs.conn.attributeVersion++
- i.attributeVersion = i.fs.conn.attributeVersion
- atomic.StoreUint64(&i.size, 0)
- i.fs.conn.mu.Unlock()
- i.attributeTime = 0
- }
-
- if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), vfsd, fdOptions); err != nil {
- return nil, err
- }
- return &fd.vfsfd, nil
-}
-
-// Lookup implements kernfs.Inode.Lookup.
-func (i *inode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
- in := linux.FUSELookupIn{Name: name}
- return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
-}
-
-// IterDirents implements kernfs.Inode.IterDirents.
-func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
- return offset, nil
-}
-
-// Valid implements kernfs.Inode.Valid.
-func (*inode) Valid(ctx context.Context) bool {
- return true
-}
-
-// NewFile implements kernfs.Inode.NewFile.
-func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
- kernelTask := kernel.TaskFromContext(ctx)
- if kernelTask == nil {
- log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
- return nil, syserror.EINVAL
- }
- in := linux.FUSECreateIn{
- CreateMeta: linux.FUSECreateMeta{
- Flags: opts.Flags,
- Mode: uint32(opts.Mode) | linux.S_IFREG,
- Umask: uint32(kernelTask.FSContext().Umask()),
- },
- Name: name,
- }
- return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
-}
-
-// NewNode implements kernfs.Inode.NewNode.
-func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
- in := linux.FUSEMknodIn{
- MknodMeta: linux.FUSEMknodMeta{
- Mode: uint32(opts.Mode),
- Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor),
- Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
- },
- Name: name,
- }
- return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
-}
-
-// NewSymlink implements kernfs.Inode.NewSymlink.
-func (i *inode) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
- in := linux.FUSESymLinkIn{
- Name: name,
- Target: target,
- }
- return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
-}
-
-// Unlink implements kernfs.Inode.Unlink.
-func (i *inode) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
- kernelTask := kernel.TaskFromContext(ctx)
- if kernelTask == nil {
- log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
- return syserror.EINVAL
- }
- in := linux.FUSEUnlinkIn{Name: name}
- req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in)
- if err != nil {
- return err
- }
- res, err := i.fs.conn.Call(kernelTask, req)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
- return err
- }
- // only return error, discard res.
- if err := res.Error(); err != nil {
- return err
- }
- return i.dentry.RemoveChildLocked(name, child)
-}
-
-// NewDir implements kernfs.Inode.NewDir.
-func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
- in := linux.FUSEMkdirIn{
- MkdirMeta: linux.FUSEMkdirMeta{
- Mode: uint32(opts.Mode),
- Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
- },
- Name: name,
- }
- return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
-}
-
-// RmDir implements kernfs.Inode.RmDir.
-func (i *inode) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
- fusefs := i.fs
- task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
-
- in := linux.FUSERmDirIn{Name: name}
- req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in)
- if err != nil {
- return err
- }
-
- res, err := i.fs.conn.Call(task, req)
- if err != nil {
- return err
- }
- if err := res.Error(); err != nil {
- return err
- }
-
- // TODO(Before merging): When creating new nodes, should we add nodes to the ordered children?
- // If so we'll probably need to call this. We will also need to add them with the writable flag when
- // appropriate.
- // return i.OrderedChildren.RmDir(ctx, name, child)
-
- return nil
-}
-
-// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response.
-// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
-func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*vfs.Dentry, error) {
- kernelTask := kernel.TaskFromContext(ctx)
- if kernelTask == nil {
- log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
- return nil, syserror.EINVAL
- }
- req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload)
- if err != nil {
- return nil, err
- }
- res, err := i.fs.conn.Call(kernelTask, req)
- if err != nil {
- return nil, err
- }
- if err := res.Error(); err != nil {
return nil, err
}
- out := linux.FUSEEntryOut{}
- if err := res.UnmarshalPayload(&out); err != nil {
- return nil, err
- }
- if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
- return nil, syserror.EIO
- }
- child := i.fs.newInode(out.NodeID, out.Attr)
- if opcode == linux.FUSE_LOOKUP {
- i.dentry.InsertChildLocked(name, child)
- } else {
- i.dentry.InsertChild(name, child)
- }
- return child.VFSDentry(), nil
-}
-
-// Getlink implements kernfs.Inode.Getlink.
-func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
- path, err := i.Readlink(ctx, mnt)
- return vfs.VirtualDentry{}, path, err
-}
-
-// Readlink implements kernfs.Inode.Readlink.
-func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
- if i.Mode().FileType()&linux.S_IFLNK == 0 {
- return "", syserror.EINVAL
- }
- if len(i.link) == 0 {
- kernelTask := kernel.TaskFromContext(ctx)
- if kernelTask == nil {
- log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context")
- return "", syserror.EINVAL
- }
- req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{})
- if err != nil {
- return "", err
- }
- res, err := i.fs.conn.Call(kernelTask, req)
- if err != nil {
- return "", err
- }
- i.link = string(res.data[res.hdr.SizeBytes():])
- if !mnt.Options().ReadOnly {
- i.attributeTime = 0
- }
- }
- return i.link, nil
-}
-
-// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache.
-// TODO(gvisor.dev/issue/3679): Add support for other fields.
-func (i *inode) getFUSEAttr() linux.FUSEAttr {
- return linux.FUSEAttr{
- Ino: i.Ino(),
- Size: atomic.LoadUint64(&i.size),
- Mode: uint32(i.Mode()),
- }
+ return fd.VFSFileDescription(), nil
}
// statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The
@@ -648,89 +284,47 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
return stat
}
-// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request
-// or read from local cache. It updates the corresponding attributes if
-// necessary.
-func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) {
- attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion)
-
- // TODO(gvisor.dev/issue/3679): send the request only if
- // - invalid local cache for fields specified in the opts.Mask
- // - forced update
- // - i.attributeTime expired
- // If local cache is still valid, return local cache.
- // Currently we always send a request,
- // and we always set the metadata with the new result,
- // unless attributeVersion has changed.
-
- task := kernel.TaskFromContext(ctx)
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ fusefs := fs.Impl().(*filesystem)
+ conn := fusefs.conn
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
if task == nil {
log.Warningf("couldn't get kernel task from context")
- return linux.FUSEAttr{}, syserror.EINVAL
+ return linux.Statx{}, syserror.EINVAL
}
- creds := auth.CredentialsFromContext(ctx)
-
- in := linux.FUSEGetAttrIn{
- GetAttrFlags: flags,
- Fh: fh,
- }
- req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in)
+ var in linux.FUSEGetAttrIn
+ // We don't set any attribute in the request, because in VFS2 fstat(2) will
+ // finally be translated into vfs.FilesystemImpl.StatAt() (see
+ // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow
+ // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2.
+ req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in)
if err != nil {
- return linux.FUSEAttr{}, err
+ return linux.Statx{}, err
}
- res, err := i.fs.conn.Call(task, req)
+ res, err := conn.Call(task, req)
if err != nil {
- return linux.FUSEAttr{}, err
+ return linux.Statx{}, err
}
if err := res.Error(); err != nil {
- return linux.FUSEAttr{}, err
+ return linux.Statx{}, err
}
var out linux.FUSEGetAttrOut
if err := res.UnmarshalPayload(&out); err != nil {
- return linux.FUSEAttr{}, err
- }
-
- // Local version is newer, return the local one.
- // Skip the update.
- if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion {
- return i.getFUSEAttr(), nil
+ return linux.Statx{}, err
}
- // Set the metadata of kernfs.InodeAttrs.
- if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{
- Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
+ // Set all metadata into kernfs.InodeAttrs.
+ if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor),
}); err != nil {
- return linux.FUSEAttr{}, err
- }
-
- // Set the size if no error (after SetStat() check).
- atomic.StoreUint64(&i.size, out.Attr.Size)
-
- return out.Attr, nil
-}
-
-// reviseAttr attempts to update the attributes for internal purposes
-// by calling getAttr with a pre-specified mask.
-// Used by read, write, lseek.
-func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error {
- // Never need atime for internal purposes.
- _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{
- Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME,
- }, flags, fh)
- return err
-}
-
-// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- attr, err := i.getAttr(ctx, fs, opts, 0, 0)
- if err != nil {
return linux.Statx{}, err
}
- return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil
+ return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
}
// DecRef implements kernfs.Inode.
@@ -743,84 +337,3 @@ func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, e
// TODO(gvisor.dev/issues/3413): Complete the implementation of statfs.
return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil
}
-
-// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask
-// aligned with the attribute mask defined in include/linux/fs.h.
-func fattrMaskFromStats(mask uint32) uint32 {
- var fuseAttrMask uint32
- maskMap := map[uint32]uint32{
- linux.STATX_MODE: linux.FATTR_MODE,
- linux.STATX_UID: linux.FATTR_UID,
- linux.STATX_GID: linux.FATTR_GID,
- linux.STATX_SIZE: linux.FATTR_SIZE,
- linux.STATX_ATIME: linux.FATTR_ATIME,
- linux.STATX_MTIME: linux.FATTR_MTIME,
- linux.STATX_CTIME: linux.FATTR_CTIME,
- }
- for statxMask, fattrMask := range maskMap {
- if mask&statxMask != 0 {
- fuseAttrMask |= fattrMask
- }
- }
- return fuseAttrMask
-}
-
-// SetStat implements kernfs.Inode.SetStat.
-func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
- return i.setAttr(ctx, fs, creds, opts, false, 0)
-}
-
-func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error {
- conn := i.fs.conn
- task := kernel.TaskFromContext(ctx)
- if task == nil {
- log.Warningf("couldn't get kernel task from context")
- return syserror.EINVAL
- }
-
- // We should retain the original file type when assigning new mode.
- fileType := uint16(i.Mode()) & linux.S_IFMT
- fattrMask := fattrMaskFromStats(opts.Stat.Mask)
- if useFh {
- fattrMask |= linux.FATTR_FH
- }
- in := linux.FUSESetAttrIn{
- Valid: fattrMask,
- Fh: fh,
- Size: opts.Stat.Size,
- Atime: uint64(opts.Stat.Atime.Sec),
- Mtime: uint64(opts.Stat.Mtime.Sec),
- Ctime: uint64(opts.Stat.Ctime.Sec),
- AtimeNsec: opts.Stat.Atime.Nsec,
- MtimeNsec: opts.Stat.Mtime.Nsec,
- CtimeNsec: opts.Stat.Ctime.Nsec,
- Mode: uint32(fileType | opts.Stat.Mode),
- UID: opts.Stat.UID,
- GID: opts.Stat.GID,
- }
- req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in)
- if err != nil {
- return err
- }
-
- res, err := conn.Call(task, req)
- if err != nil {
- return err
- }
- if err := res.Error(); err != nil {
- return err
- }
- out := linux.FUSEGetAttrOut{}
- if err := res.UnmarshalPayload(&out); err != nil {
- return err
- }
-
- // Set the metadata of kernfs.InodeAttrs.
- if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{
- Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
- }); err != nil {
- return err
- }
-
- return nil
-}
diff --git a/pkg/sentry/fsimpl/fuse/connection_control.go b/pkg/sentry/fsimpl/fuse/init.go
index a63c66e7c..779c2bd3f 100644
--- a/pkg/sentry/fsimpl/fuse/connection_control.go
+++ b/pkg/sentry/fsimpl/fuse/init.go
@@ -15,11 +15,7 @@
package fuse
import (
- "sync/atomic"
- "syscall"
-
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
@@ -33,10 +29,9 @@ const (
// Follow the same behavior as unix fuse implementation.
fuseMaxTimeGranNs = 1000000000
- // Minimum value for MaxWrite and MaxRead.
+ // Minimum value for MaxWrite.
// Follow the same behavior as unix fuse implementation.
fuseMinMaxWrite = 4096
- fuseMinMaxRead = 4096
// Temporary default value for max readahead, 128kb.
fuseDefaultMaxReadahead = 131072
@@ -54,26 +49,6 @@ var (
MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
)
-// SetInitialized atomically sets the connection as initialized.
-func (conn *connection) SetInitialized() {
- // Unblock the requests sent before INIT.
- close(conn.initializedChan)
-
- // Close the channel first to avoid the non-atomic situation
- // where conn.initialized is true but there are
- // tasks being blocked on the channel.
- // And it prevents the newer tasks from gaining
- // unnecessary higher chance to be issued before the blocked one.
-
- atomic.StoreInt32(&(conn.initialized), int32(1))
-}
-
-// IsInitialized atomically check if the connection is initialized.
-// pairs with SetInitialized().
-func (conn *connection) Initialized() bool {
- return atomic.LoadInt32(&(conn.initialized)) != 0
-}
-
// InitSend sends a FUSE_INIT request.
func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
in := linux.FUSEInitIn{
@@ -100,24 +75,24 @@ func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
return err
}
- initRes := fuseInitRes{initLen: res.DataLen()}
- if err := res.UnmarshalPayload(&initRes); err != nil {
+ var out linux.FUSEInitOut
+ if err := res.UnmarshalPayload(&out); err != nil {
return err
}
- return conn.initProcessReply(&initRes.initOut, hasSysAdminCap)
+ return conn.initProcessReply(&out, hasSysAdminCap)
}
// Process the FUSE_INIT reply from the FUSE server.
-// It tries to acquire the conn.asyncMu lock if minor version is newer than 13.
func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
- // No matter error or not, always set initialzied.
- // to unblock the blocked requests.
- defer conn.SetInitialized()
-
// No support for old major fuse versions.
if out.Major != linux.FUSE_KERNEL_VERSION {
conn.connInitError = true
+
+ // Set the connection as initialized and unblock the blocked requests
+ // (i.e. return error for them).
+ conn.SetInitialized()
+
return nil
}
@@ -125,14 +100,29 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
conn.connInitSuccess = true
conn.minor = out.Minor
- // No support for negotiating MaxWrite before minor version 5.
- if out.Minor >= 5 {
- conn.maxWrite = out.MaxWrite
- } else {
- conn.maxWrite = fuseMinMaxWrite
- }
- if conn.maxWrite < fuseMinMaxWrite {
- conn.maxWrite = fuseMinMaxWrite
+ // No support for limits before minor version 13.
+ if out.Minor >= 13 {
+ conn.bgLock.Lock()
+
+ if out.MaxBackground > 0 {
+ conn.maxBackground = out.MaxBackground
+
+ if !hasSysAdminCap &&
+ conn.maxBackground > MaxUserBackgroundRequest {
+ conn.maxBackground = MaxUserBackgroundRequest
+ }
+ }
+
+ if out.CongestionThreshold > 0 {
+ conn.congestionThreshold = out.CongestionThreshold
+
+ if !hasSysAdminCap &&
+ conn.congestionThreshold > MaxUserCongestionThreshold {
+ conn.congestionThreshold = MaxUserCongestionThreshold
+ }
+ }
+
+ conn.bgLock.Unlock()
}
// No support for the following flags before minor version 6.
@@ -141,6 +131,8 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
+ conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
+ conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
@@ -156,90 +148,19 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
}
}
- // No support for limits before minor version 13.
- if out.Minor >= 13 {
- conn.asyncMu.Lock()
-
- if out.MaxBackground > 0 {
- conn.asyncNumMax = out.MaxBackground
-
- if !hasSysAdminCap &&
- conn.asyncNumMax > MaxUserBackgroundRequest {
- conn.asyncNumMax = MaxUserBackgroundRequest
- }
- }
-
- if out.CongestionThreshold > 0 {
- conn.asyncCongestionThreshold = out.CongestionThreshold
-
- if !hasSysAdminCap &&
- conn.asyncCongestionThreshold > MaxUserCongestionThreshold {
- conn.asyncCongestionThreshold = MaxUserCongestionThreshold
- }
- }
-
- conn.asyncMu.Unlock()
- }
-
- return nil
-}
-
-// Abort this FUSE connection.
-// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order.
-// All possible requests waiting or blocking will be aborted.
-func (conn *connection) Abort(ctx context.Context) {
- conn.fd.mu.Lock()
- conn.mu.Lock()
- conn.asyncMu.Lock()
-
- if !conn.connected {
- conn.asyncMu.Unlock()
- conn.mu.Unlock()
- conn.fd.mu.Unlock()
- return
- }
-
- conn.connected = false
-
- // Empty the `fd.queue` that holds the requests
- // not yet read by the FUSE daemon yet.
- // These are a subset of the requests in `fuse.completion` map.
- for !conn.fd.queue.Empty() {
- req := conn.fd.queue.Front()
- conn.fd.queue.Remove(req)
- }
-
- var terminate []linux.FUSEOpID
-
- // 2. Collect the requests have not been sent to FUSE daemon,
- // or have not received a reply.
- for unique := range conn.fd.completions {
- terminate = append(terminate, unique)
- }
-
- // Release all locks to avoid deadlock.
- conn.asyncMu.Unlock()
- conn.mu.Unlock()
- conn.fd.mu.Unlock()
-
- // 1. The requets blocked before initialization.
- // Will reach call() `connected` check and return.
- if !conn.Initialized() {
- conn.SetInitialized()
+ // No support for negotiating MaxWrite before minor version 5.
+ if out.Minor >= 5 {
+ conn.maxWrite = out.MaxWrite
+ } else {
+ conn.maxWrite = fuseMinMaxWrite
}
-
- // 2. Terminate the requests collected above.
- // Set ECONNABORTED error.
- // sendError() will remove them from `fd.completion` map.
- // Will enter the path of a normally received error.
- for _, toTerminate := range terminate {
- conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate)
+ if conn.maxWrite < fuseMinMaxWrite {
+ conn.maxWrite = fuseMinMaxWrite
}
- // 3. The requests not yet written to FUSE device.
- // Early terminate.
- // Will reach callFutureLocked() `connected` check and return.
- close(conn.fd.fullQueueCh)
+ // Set connection as initialized and unblock the requests
+ // issued before init.
+ conn.SetInitialized()
- // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs.
+ return nil
}
diff --git a/pkg/sentry/fsimpl/fuse/inode_refs.go b/pkg/sentry/fsimpl/fuse/inode_refs.go
index 5d1de6067..6b9456e1d 100644
--- a/pkg/sentry/fsimpl/fuse/inode_refs.go
+++ b/pkg/sentry/fsimpl/fuse/inode_refs.go
@@ -1,10 +1,10 @@
package fuse
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go
deleted file mode 100644
index 625d1547f..000000000
--- a/pkg/sentry/fsimpl/fuse/read_write.go
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package fuse
-
-import (
- "io"
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
-)
-
-// ReadInPages sends FUSE_READ requests for the size after round it up to
-// a multiple of page size, blocks on it for reply, processes the reply
-// and returns the payload (or joined payloads) as a byte slice.
-// This is used for the general purpose reading.
-// We do not support direct IO (which read the exact number of bytes)
-// at this moment.
-func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) {
- attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion)
-
- t := kernel.TaskFromContext(ctx)
- if t == nil {
- log.Warningf("fusefs.Read: couldn't get kernel task from context")
- return nil, 0, syserror.EINVAL
- }
-
- // Round up to a multiple of page size.
- readSize, _ := usermem.PageRoundUp(uint64(size))
-
- // One request cannnot exceed either maxRead or maxPages.
- maxPages := fs.conn.maxRead >> usermem.PageShift
- if maxPages > uint32(fs.conn.maxPages) {
- maxPages = uint32(fs.conn.maxPages)
- }
-
- var outs [][]byte
- var sizeRead uint32
-
- // readSize is a multiple of usermem.PageSize.
- // Always request bytes as a multiple of pages.
- pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift)
-
- // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
- in := linux.FUSEReadIn{
- Fh: fd.Fh,
- LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock
- ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
- Flags: fd.statusFlags(),
- }
-
- // This loop is intended for fragmented read where the bytes to read is
- // larger than either the maxPages or maxRead.
- // For the majority of reads with normal size, this loop should only
- // execute once.
- for pagesRead < pagesToRead {
- pagesCanRead := pagesToRead - pagesRead
- if pagesCanRead > maxPages {
- pagesCanRead = maxPages
- }
-
- in.Offset = off + (uint64(pagesRead) << usermem.PageShift)
- in.Size = pagesCanRead << usermem.PageShift
-
- req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in)
- if err != nil {
- return nil, 0, err
- }
-
- // TODO(gvisor.dev/issue/3247): support async read.
-
- res, err := fs.conn.Call(t, req)
- if err != nil {
- return nil, 0, err
- }
- if err := res.Error(); err != nil {
- return nil, 0, err
- }
-
- // Not enough bytes in response,
- // either we reached EOF,
- // or the FUSE server sends back a response
- // that cannot even fit the hdr.
- if len(res.data) <= res.hdr.SizeBytes() {
- // We treat both case as EOF here for now
- // since there is no reliable way to detect
- // the over-short hdr case.
- break
- }
-
- // Directly using the slice to avoid extra copy.
- out := res.data[res.hdr.SizeBytes():]
-
- outs = append(outs, out)
- sizeRead += uint32(len(out))
-
- pagesRead += pagesCanRead
- }
-
- defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion)
-
- // No bytes returned: offset >= EOF.
- if len(outs) == 0 {
- return nil, 0, io.EOF
- }
-
- return outs, sizeRead, nil
-}
-
-// ReadCallback updates several information after receiving a read response.
-// Due to readahead, sizeRead can be larger than size.
-func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) {
- // TODO(gvisor.dev/issue/3247): support async read.
- // If this is called by an async read, correctly process it.
- // May need to update the signature.
-
- i := fd.inode()
- // TODO(gvisor.dev/issue/1193): Invalidate or update atime.
-
- // Reached EOF.
- if sizeRead < size {
- // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole.
- // Might need to update the buf to be returned from the Read().
-
- // Update existing size.
- newSize := off + uint64(sizeRead)
- fs.conn.mu.Lock()
- if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) {
- fs.conn.attributeVersion++
- i.attributeVersion = i.fs.conn.attributeVersion
- atomic.StoreUint64(&i.size, newSize)
- }
- fs.conn.mu.Unlock()
- }
-}
-
-// Write sends FUSE_WRITE requests and return the bytes
-// written according to the response.
-//
-// Preconditions: len(data) == size.
-func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
- t := kernel.TaskFromContext(ctx)
- if t == nil {
- log.Warningf("fusefs.Read: couldn't get kernel task from context")
- return 0, syserror.EINVAL
- }
-
- // One request cannnot exceed either maxWrite or maxPages.
- maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
- if maxWrite > fs.conn.maxWrite {
- maxWrite = fs.conn.maxWrite
- }
-
- // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
- in := linux.FUSEWriteIn{
- Fh: fd.Fh,
- // TODO(gvisor.dev/issue/3245): file lock
- LockOwner: 0,
- // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
- // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
- WriteFlags: 0,
- Flags: fd.statusFlags(),
- }
-
- var written uint32
-
- // This loop is intended for fragmented write where the bytes to write is
- // larger than either the maxWrite or maxPages or when bigWrites is false.
- // Unless a small value for max_write is explicitly used, this loop
- // is expected to execute only once for the majority of the writes.
- for written < size {
- toWrite := size - written
-
- // Limit the write size to one page.
- // Note that the bigWrites flag is obsolete,
- // latest libfuse always sets it on.
- if !fs.conn.bigWrites && toWrite > usermem.PageSize {
- toWrite = usermem.PageSize
- }
-
- // Limit the write size to maxWrite.
- if toWrite > maxWrite {
- toWrite = maxWrite
- }
-
- in.Offset = off + uint64(written)
- in.Size = toWrite
-
- req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in)
- if err != nil {
- return 0, err
- }
-
- req.payload = data[written : written+toWrite]
-
- // TODO(gvisor.dev/issue/3247): support async write.
-
- res, err := fs.conn.Call(t, req)
- if err != nil {
- return 0, err
- }
- if err := res.Error(); err != nil {
- return 0, err
- }
-
- out := linux.FUSEWriteOut{}
- if err := res.UnmarshalPayload(&out); err != nil {
- return 0, err
- }
-
- // Write more than requested? EIO.
- if out.Size > toWrite {
- return 0, syserror.EIO
- }
-
- written += out.Size
-
- // Break if short write. Not necessarily an error.
- if out.Size != toWrite {
- break
- }
- }
-
- return written, nil
-}
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
deleted file mode 100644
index 5bdd096c3..000000000
--- a/pkg/sentry/fsimpl/fuse/regular_file.go
+++ /dev/null
@@ -1,230 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package fuse
-
-import (
- "io"
- "math"
- "sync"
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
-)
-
-type regularFileFD struct {
- fileDescription
-
- // off is the file offset.
- off int64
- // offMu protects off.
- offMu sync.Mutex
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
- if offset < 0 {
- return 0, syserror.EINVAL
- }
-
- // Check that flags are supported.
- //
- // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
- if opts.Flags&^linux.RWF_HIPRI != 0 {
- return 0, syserror.EOPNOTSUPP
- }
-
- size := dst.NumBytes()
- if size == 0 {
- // Early return if count is 0.
- return 0, nil
- } else if size > math.MaxUint32 {
- // FUSE only supports uint32 for size.
- // Overflow.
- return 0, syserror.EINVAL
- }
-
- // TODO(gvisor.dev/issue/3678): Add direct IO support.
-
- inode := fd.inode()
-
- // Reading beyond EOF, update file size if outdated.
- if uint64(offset+size) > atomic.LoadUint64(&inode.size) {
- if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil {
- return 0, err
- }
- // If the offset after update is still too large, return error.
- if uint64(offset) >= atomic.LoadUint64(&inode.size) {
- return 0, io.EOF
- }
- }
-
- // Truncate the read with updated file size.
- fileSize := atomic.LoadUint64(&inode.size)
- if uint64(offset+size) > fileSize {
- size = int64(fileSize) - offset
- }
-
- buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size))
- if err != nil {
- return 0, err
- }
-
- // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching),
- // store the bytes that were read ahead.
-
- // Update the number of bytes to copy for short read.
- if n < uint32(size) {
- size = int64(n)
- }
-
- // Copy the bytes read to the dst.
- // This loop is intended for fragmented reads.
- // For the majority of reads, this loop only execute once.
- var copied int64
- for _, buffer := range buffers {
- toCopy := int64(len(buffer))
- if copied+toCopy > size {
- toCopy = size - copied
- }
- cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy])
- if err != nil {
- return 0, err
- }
- if int64(cp) != toCopy {
- return 0, syserror.EIO
- }
- copied += toCopy
- }
-
- return copied, nil
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- fd.offMu.Lock()
- n, err := fd.PRead(ctx, dst, fd.off, opts)
- fd.off += n
- fd.offMu.Unlock()
- return n, err
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- n, _, err := fd.pwrite(ctx, src, offset, opts)
- return n, err
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
- fd.offMu.Lock()
- n, off, err := fd.pwrite(ctx, src, fd.off, opts)
- fd.off = off
- fd.offMu.Unlock()
- return n, err
-}
-
-// pwrite returns the number of bytes written, final offset and error. The
-// final offset should be ignored by PWrite.
-func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
- if offset < 0 {
- return 0, offset, syserror.EINVAL
- }
-
- // Check that flags are supported.
- //
- // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
- if opts.Flags&^linux.RWF_HIPRI != 0 {
- return 0, offset, syserror.EOPNOTSUPP
- }
-
- inode := fd.inode()
- inode.metadataMu.Lock()
- defer inode.metadataMu.Unlock()
-
- // If the file is opened with O_APPEND, update offset to file size.
- // Note: since our Open() implements the interface of kernfs,
- // and kernfs currently does not support O_APPEND, this will never
- // be true before we switch out from kernfs.
- if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
- // Locking inode.metadataMu is sufficient for reading size
- offset = int64(inode.size)
- }
-
- srclen := src.NumBytes()
-
- if srclen > math.MaxUint32 {
- // FUSE only supports uint32 for size.
- // Overflow.
- return 0, offset, syserror.EINVAL
- }
- if end := offset + srclen; end < offset {
- // Overflow.
- return 0, offset, syserror.EINVAL
- }
-
- srclen, err = vfs.CheckLimit(ctx, offset, srclen)
- if err != nil {
- return 0, offset, err
- }
-
- if srclen == 0 {
- // Return before causing any side effects.
- return 0, offset, nil
- }
-
- src = src.TakeFirst64(srclen)
-
- // TODO(gvisor.dev/issue/3237): Add cache support:
- // buffer cache. Ideally we write from src to our buffer cache first.
- // The slice passed to fs.Write() should be a slice from buffer cache.
- data := make([]byte, srclen)
- // Reason for making a copy here: connection.Call() blocks on kerneltask,
- // which in turn acquires mm.activeMu lock. Functions like CopyInTo() will
- // attemp to acquire the mm.activeMu lock as well -> deadlock.
- // We must finish reading from the userspace memory before
- // t.Block() deactivates it.
- cp, err := src.CopyIn(ctx, data)
- if err != nil {
- return 0, offset, err
- }
- if int64(cp) != srclen {
- return 0, offset, syserror.EIO
- }
-
- n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
- if err != nil {
- return 0, offset, err
- }
-
- if n == 0 {
- // We have checked srclen != 0 previously.
- // If err == nil, then it's a short write and we return EIO.
- return 0, offset, syserror.EIO
- }
-
- written = int64(n)
- finalOff = offset + written
-
- if finalOff > int64(inode.size) {
- atomic.StoreUint64(&inode.size, uint64(finalOff))
- atomic.AddUint64(&inode.fs.conn.attributeVersion, 1)
- }
-
- return
-}
diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go
deleted file mode 100644
index 70db50f38..000000000
--- a/pkg/sentry/fsimpl/fuse/request_response.go
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package fuse
-
-import (
- "fmt"
- "syscall"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
-)
-
-// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE
-// server may implement an older version of FUSE protocol, which contains a
-// linux.FUSEInitOut with less attributes.
-//
-// Dynamically-sized objects cannot be marshalled.
-type fuseInitRes struct {
- marshal.StubMarshallable
-
- // initOut contains the response from the FUSE server.
- initOut linux.FUSEInitOut
-
- // initLen is the total length of bytes of the response.
- initLen uint32
-}
-
-// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes.
-func (r *fuseInitRes) UnmarshalBytes(src []byte) {
- out := &r.initOut
-
- // Introduced before FUSE kernel version 7.13.
- out.Major = uint32(usermem.ByteOrder.Uint32(src[:4]))
- src = src[4:]
- out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4]))
- src = src[4:]
- out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4]))
- src = src[4:]
- out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4]))
- src = src[4:]
- out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2]))
- src = src[2:]
- out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2]))
- src = src[2:]
- out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4]))
- src = src[4:]
-
- // Introduced in FUSE kernel version 7.23.
- if len(src) >= 4 {
- out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4]))
- src = src[4:]
- }
- // Introduced in FUSE kernel version 7.28.
- if len(src) >= 2 {
- out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2]))
- src = src[2:]
- }
-}
-
-// SizeBytes is the size of the payload of the FUSE_INIT response.
-func (r *fuseInitRes) SizeBytes() int {
- return int(r.initLen)
-}
-
-// Ordinary requests have even IDs, while interrupts IDs are odd.
-// Used to increment the unique ID for each FUSE request.
-var reqIDStep uint64 = 2
-
-// Request represents a FUSE operation request that hasn't been sent to the
-// server yet.
-//
-// +stateify savable
-type Request struct {
- requestEntry
-
- id linux.FUSEOpID
- hdr *linux.FUSEHeaderIn
- data []byte
-
- // payload for this request: extra bytes to write after
- // the data slice. Used by FUSE_WRITE.
- payload []byte
-
- // If this request is async.
- async bool
- // If we don't care its response.
- // Manually set by the caller.
- noReply bool
-}
-
-// NewRequest creates a new request that can be sent to the FUSE server.
-func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
- conn.fd.mu.Lock()
- defer conn.fd.mu.Unlock()
- conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
-
- hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
- hdr := linux.FUSEHeaderIn{
- Len: uint32(hdrLen + payload.SizeBytes()),
- Opcode: opcode,
- Unique: conn.fd.nextOpID,
- NodeID: ino,
- UID: uint32(creds.EffectiveKUID),
- GID: uint32(creds.EffectiveKGID),
- PID: pid,
- }
-
- buf := make([]byte, hdr.Len)
-
- // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
- hdr.MarshalBytes(buf[:hdrLen])
- payload.MarshalBytes(buf[hdrLen:])
-
- return &Request{
- id: hdr.Unique,
- hdr: &hdr,
- data: buf,
- }, nil
-}
-
-// futureResponse represents an in-flight request, that may or may not have
-// completed yet. Convert it to a resolved Response by calling Resolve, but note
-// that this may block.
-//
-// +stateify savable
-type futureResponse struct {
- opcode linux.FUSEOpcode
- ch chan struct{}
- hdr *linux.FUSEHeaderOut
- data []byte
-
- // If this request is async.
- async bool
-}
-
-// newFutureResponse creates a future response to a FUSE request.
-func newFutureResponse(req *Request) *futureResponse {
- return &futureResponse{
- opcode: req.hdr.Opcode,
- ch: make(chan struct{}),
- async: req.async,
- }
-}
-
-// resolve blocks the task until the server responds to its corresponding request,
-// then returns a resolved response.
-func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
- // Return directly for async requests.
- if f.async {
- return nil, nil
- }
-
- if err := t.Block(f.ch); err != nil {
- return nil, err
- }
-
- return f.getResponse(), nil
-}
-
-// getResponse creates a Response from the data the futureResponse has.
-func (f *futureResponse) getResponse() *Response {
- return &Response{
- opcode: f.opcode,
- hdr: *f.hdr,
- data: f.data,
- }
-}
-
-// Response represents an actual response from the server, including the
-// response payload.
-//
-// +stateify savable
-type Response struct {
- opcode linux.FUSEOpcode
- hdr linux.FUSEHeaderOut
- data []byte
-}
-
-// Error returns the error of the FUSE call.
-func (r *Response) Error() error {
- errno := r.hdr.Error
- if errno >= 0 {
- return nil
- }
-
- sysErrNo := syscall.Errno(-errno)
- return error(sysErrNo)
-}
-
-// DataLen returns the size of the response without the header.
-func (r *Response) DataLen() uint32 {
- return r.hdr.Len - uint32(r.hdr.SizeBytes())
-}
-
-// UnmarshalPayload unmarshals the response data into m.
-func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
- hdrLen := r.hdr.SizeBytes()
- haveDataLen := r.hdr.Len - uint32(hdrLen)
- wantDataLen := uint32(m.SizeBytes())
-
- if haveDataLen < wantDataLen {
- return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
- }
-
- // The response data is empty unless there is some payload. And so, doesn't
- // need to be unmarshalled.
- if r.data == nil {
- return nil
- }
-
- // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
- m.UnmarshalBytes(r.data[hdrLen:])
- return nil
-}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index fa4e19113..0e21c31a4 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -195,11 +195,7 @@ const (
// and consistent with Linux's semantics (in particular, it is not always
// possible for clients to set arbitrary atimes and mtimes depending on the
// remote filesystem implementation, and never possible for clients to set
- // arbitrary ctimes.) If a dentry containing a client-defined atime or
- // mtime is evicted from cache, client timestamps will be sent to the
- // remote filesystem on a best-effort basis to attempt to ensure that
- // timestamps will be preserved when another dentry representing the same
- // file is instantiated.
+ // arbitrary ctimes.)
InteropModeExclusive InteropMode = iota
// InteropModeWritethrough is appropriate when there are read-only users of
@@ -1315,32 +1311,15 @@ func (d *dentry) destroyLocked(ctx context.Context) {
d.handleMu.Unlock()
if !d.file.isNil() {
- valid := p9.SetAttrMask{}
- attr := p9.SetAttr{}
- if !d.isDeleted() {
- // Write dirty timestamps back to the remote filesystem.
- if atomic.LoadUint32(&d.atimeDirty) != 0 {
- valid.ATime = true
- valid.ATimeNotSystemTime = true
- atime := atomic.LoadInt64(&d.atime)
- attr.ATimeSeconds = uint64(atime / 1e9)
- attr.ATimeNanoSeconds = uint64(atime % 1e9)
- }
- if atomic.LoadUint32(&d.mtimeDirty) != 0 {
- valid.MTime = true
- valid.MTimeNotSystemTime = true
- mtime := atomic.LoadInt64(&d.mtime)
- attr.MTimeSeconds = uint64(mtime / 1e9)
- attr.MTimeNanoSeconds = uint64(mtime % 1e9)
- }
- }
-
- // Check if attributes need to be changed before closing the file.
- if valid.ATime || valid.MTime {
- if err := d.file.setAttrClose(ctx, valid, attr); err != nil {
- log.Warningf("gofer.dentry.destroyLocked: failed to close file with write dirty timestamps: %v", err)
- }
- } else if err := d.file.close(ctx); err != nil {
+ // Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
+ // i.e. client and server timestamps may differ (because e.g. a client
+ // write was serviced by the page cache, and only written back to the
+ // remote file later). Ideally, we'd write client timestamps back to
+ // the remote filesystem so that timestamps for a new dentry
+ // instantiated for the same file would remain coherent. Unfortunately,
+ // this turns out to be too expensive in many cases, so for now we
+ // don't do this.
+ if err := d.file.close(ctx); err != nil {
log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
}
d.file = p9file{}
diff --git a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go
index abf4a9082..babb3f664 100644
--- a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go
+++ b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go
@@ -1,10 +1,10 @@
package host
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/host/inode_refs.go b/pkg/sentry/fsimpl/host/inode_refs.go
index 75b9f49e2..17f90ce4a 100644
--- a/pkg/sentry/fsimpl/host/inode_refs.go
+++ b/pkg/sentry/fsimpl/host/inode_refs.go
@@ -1,10 +1,10 @@
package host
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
index 35ded24bc..c0bf45f08 100644
--- a/pkg/sentry/fsimpl/host/socket_unsafe.go
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
if n > length {
- return length, n, msg.Controllen, controlTrunc, err
+ return length, n, msg.Controllen, controlTrunc, nil
}
- return n, n, msg.Controllen, controlTrunc, err
+ return n, n, msg.Controllen, controlTrunc, nil
}
// fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 7a9be4b97..97cefa350 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -17,6 +17,7 @@ package host
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -25,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// TTYFileDescription implements vfs.FileDescriptionImpl for a host file
diff --git a/pkg/sentry/fsimpl/kernfs/dentry_refs.go b/pkg/sentry/fsimpl/kernfs/dentry_refs.go
index b7125caee..79863b3bc 100644
--- a/pkg/sentry/fsimpl/kernfs/dentry_refs.go
+++ b/pkg/sentry/fsimpl/kernfs/dentry_refs.go
@@ -1,10 +1,10 @@
package kernfs
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 49f6a0f1d..d7d3e8f48 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -140,7 +140,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
}
// Reference on childVFSD dropped by a corresponding Valid.
child = childVFSD.Impl().(*Dentry)
- parent.InsertChildLocked(name, child)
+ parent.insertChildLocked(name, child)
}
return child, nil
}
@@ -548,7 +548,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
if !d.Impl().(*Dentry).isSymlink() {
return "", syserror.EINVAL
}
- return inode.Readlink(ctx, rp.Mount())
+ return inode.Readlink(ctx)
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
@@ -657,10 +657,6 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
-
- // Store the name before walkExistingLocked as rp will be advanced past the
- // name in the following call.
- name := rp.Component()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
@@ -690,8 +686,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
-
- if err := parentDentry.inode.RmDir(ctx, name, vfsd); err != nil {
+ if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
@@ -770,10 +765,6 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
-
- // Store the name before walkExistingLocked as rp will be advanced past the
- // name in the following call.
- name := rp.Component()
vfsd, _, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
@@ -799,7 +790,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.Unlink(ctx, name, vfsd); err != nil {
+ if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index ea4f679c2..c0b863ba4 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -172,7 +172,7 @@ func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
type InodeNotSymlink struct{}
// Readlink implements Inode.Readlink.
-func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context) (string, error) {
return "", syserror.EINVAL
}
@@ -256,13 +256,6 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li
// SetStat implements Inode.SetStat.
func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
- return a.SetInodeStat(ctx, fs, creds, opts)
-}
-
-// SetInodeStat sets the corresponding attributes from opts to InodeAttrs.
-// This function can be used by other kernfs-based filesystem implementation to
-// sets the unexported attributes into kernfs.InodeAttrs.
-func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
if opts.Stat.Mask == 0 {
return nil
}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 163f26ceb..88fcd54aa 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -60,7 +60,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
@@ -247,15 +246,15 @@ func (d *Dentry) OnZeroWatches(context.Context) {}
// Precondition: d must represent a directory inode.
func (d *Dentry) InsertChild(name string, child *Dentry) {
d.dirMu.Lock()
- d.InsertChildLocked(name, child)
+ d.insertChildLocked(name, child)
d.dirMu.Unlock()
}
-// InsertChildLocked is equivalent to InsertChild, with additional
+// insertChildLocked is equivalent to InsertChild, with additional
// preconditions.
//
// Precondition: d.dirMu must be locked.
-func (d *Dentry) InsertChildLocked(name string, child *Dentry) {
+func (d *Dentry) insertChildLocked(name string, child *Dentry) {
if !d.isDir() {
panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
}
@@ -268,36 +267,6 @@ func (d *Dentry) InsertChildLocked(name string, child *Dentry) {
d.children[name] = child
}
-// RemoveChild removes child from the vfs dentry cache. This does not update the
-// directory inode or modify the inode to be unlinked. So calling this on its own
-// isn't sufficient to remove a child from a directory.
-//
-// Precondition: d must represent a directory inode.
-func (d *Dentry) RemoveChild(name string, child *vfs.Dentry) error {
- d.dirMu.Lock()
- defer d.dirMu.Unlock()
- return d.RemoveChildLocked(name, child)
-}
-
-// RemoveChildLocked is equivalent to RemoveChild, with additional
-// preconditions.
-//
-// Precondition: d.dirMu must be locked.
-func (d *Dentry) RemoveChildLocked(name string, child *vfs.Dentry) error {
- if !d.isDir() {
- panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d))
- }
- c, ok := d.children[name]
- if !ok {
- return syserror.ENOENT
- }
- if &c.vfsd != child {
- panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child))
- }
- delete(d.children, name)
- return nil
-}
-
// Inode returns the dentry's inode.
func (d *Dentry) Inode() Inode {
return d.inode
@@ -456,7 +425,7 @@ type inodeDynamicLookup interface {
Valid(ctx context.Context) bool
// IterDirents is used to iterate over dynamically created entries. It invokes
- // cb on each entry in the directory represented by the Inode.
+ // cb on each entry in the directory represented by the FileDescription.
// 'offset' is the offset for the entire IterDirents call, which may include
// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
// inside the entries returned by this IterDirents invocation. In other words,
@@ -468,7 +437,7 @@ type inodeDynamicLookup interface {
type inodeSymlink interface {
// Readlink returns the target of a symbolic link. If an inode is not a
// symlink, the implementation should return EINVAL.
- Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
+ Readlink(ctx context.Context) (string, error)
// Getlink returns the target of a symbolic link, as used by path
// resolution:
diff --git a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go
index 0ff013c97..478b04bdd 100644
--- a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go
+++ b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go
@@ -1,10 +1,10 @@
package kernfs
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index a9812fcef..64731a3e4 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -52,7 +52,7 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor
}
// Readlink implements Inode.
-func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
+func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
return s.target, nil
}
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index c589b4746..360b77ef6 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -81,6 +82,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
Start: d.parent.upperVD,
Path: fspath.Parse(d.name),
}
+ // Used during copy-up of memory-mapped regular files.
+ var mmapOpts *memmap.MMapOpts
cleanupUndoCopyUp := func() {
var err error
if ftype == linux.S_IFDIR {
@@ -136,6 +139,25 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
break
}
}
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if d.wrappedMappable != nil {
+ // We may have memory mappings of the file on the lower layer.
+ // Switch to mapping the file on the upper layer instead.
+ mmapOpts = &memmap.MMapOpts{
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.ReadWrite,
+ }
+ if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ if mmapOpts.MappingIdentity != nil {
+ mmapOpts.MappingIdentity.DecRef(ctx)
+ }
+ // Don't actually switch Mappables until the end of copy-up; see
+ // below for why.
+ }
if err := newFD.SetStat(ctx, vfs.SetStatOptions{
Stat: linux.Statx{
Mask: linux.STATX_UID | linux.STATX_GID,
@@ -265,6 +287,62 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
atomic.StoreUint64(&d.ino, upperStat.Ino)
}
+ if mmapOpts != nil && mmapOpts.Mappable != nil {
+ // Note that if mmapOpts != nil, then d.mapsMu is locked for writing
+ // (from the S_IFREG path above).
+
+ // Propagate mappings of d to the new Mappable. Remember which mappings
+ // we added so we can remove them on failure.
+ upperMappable := mmapOpts.Mappable
+ allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange)
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ added := make(memmap.MappingsOfRange)
+ for m := range seg.Value() {
+ if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil {
+ for m := range added {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ for mr, mappings := range allAdded {
+ for m := range mappings {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable)
+ }
+ }
+ return err
+ }
+ added[m] = struct{}{}
+ }
+ allAdded[seg.Range()] = added
+ }
+
+ // Switch to the new Mappable. We do this at the end of copy-up
+ // because:
+ //
+ // - We need to switch Mappables (by changing d.wrappedMappable) before
+ // invalidating Translations from the old Mappable (to pick up
+ // Translations from the new one).
+ //
+ // - We need to lock d.dataMu while changing d.wrappedMappable, but
+ // must invalidate Translations with d.dataMu unlocked (due to lock
+ // ordering).
+ //
+ // - Consequently, once we unlock d.dataMu, other threads may
+ // immediately observe the new (copied-up) Mappable, which we want to
+ // delay until copy-up is guaranteed to succeed.
+ d.dataMu.Lock()
+ lowerMappable := d.wrappedMappable
+ d.wrappedMappable = upperMappable
+ d.dataMu.Unlock()
+ d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{})
+
+ // Remove mappings from the old Mappable.
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ for m := range seg.Value() {
+ lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ }
+ d.lowerMappings.RemoveAll()
+ }
+
atomic.StoreUint32(&d.copiedUp, 1)
return nil
}
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
index 268b32537..74cfd3799 100644
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -256,10 +257,105 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
- wrappedFD, err := fd.getCurrentFD(ctx)
+ if err := fd.ensureMappable(ctx, opts); err != nil {
+ return err
+ }
+ return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts)
+}
+
+// ensureMappable ensures that fd.dentry().wrappedMappable is not nil.
+func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
+ d := fd.dentry()
+
+ // Fast path if we already have a Mappable for the current top layer.
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+
+ // Only permit mmap of regular files, since other file types may have
+ // unpredictable behavior when mmapped (e.g. /dev/zero).
+ if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG {
+ return syserror.ENODEV
+ }
+
+ // Get a Mappable for the current top layer.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ d.copyMu.RLock()
+ defer d.copyMu.RUnlock()
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+ wrappedFD, err := fd.currentFDLocked(ctx)
if err != nil {
return err
}
- defer wrappedFD.DecRef(ctx)
- return wrappedFD.ConfigureMMap(ctx, opts)
+ if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil {
+ return err
+ }
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.DecRef(ctx)
+ opts.MappingIdentity = nil
+ }
+ // Use this Mappable for all mappings of this layer (unless we raced with
+ // another call to ensureMappable).
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ if d.wrappedMappable == nil {
+ d.wrappedMappable = opts.Mappable
+ atomic.StoreUint32(&d.isMappable, 1)
+ }
+ return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, ar, offset, writable)
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable)
+ if !d.isCopiedUp() {
+ d.lowerMappings.RemoveMapping(ms, ar, offset, writable)
+ }
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, dstAR, offset, writable)
+ }
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ d.dataMu.RLock()
+ defer d.dataMu.RUnlock()
+ return d.wrappedMappable.Translate(ctx, required, optional, at)
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ return d.wrappedMappable.InvalidateUnsavable(ctx)
}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 9a8f7010e..b2efe5f80 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -22,6 +22,10 @@
// filesystem.renameMu
// dentry.dirMu
// dentry.copyMu
+// *** "memmap.Mappable locks" below this point
+// dentry.mapsMu
+// *** "memmap.Mappable locks taken by Translate" below this point
+// dentry.dataMu
//
// Locking dentry.dirMu in multiple dentries requires that parent dentries are
// locked before child dentries, and that filesystem.renameMu is locked to
@@ -37,6 +41,7 @@ import (
"gvisor.dev/gvisor/pkg/fspath"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -419,6 +424,35 @@ type dentry struct {
devMinor uint32
ino uint64
+ // If this dentry represents a regular file, then:
+ //
+ // - mapsMu is used to synchronize between copy-up and memmap.Mappable
+ // methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
+ //
+ // - dataMu is used to synchronize between copy-up and
+ // dentry.(memmap.Mappable).Translate.
+ //
+ // - lowerMappings tracks memory mappings of the file. lowerMappings is
+ // used to invalidate mappings of the lower layer when the file is copied
+ // up to ensure that they remain coherent with subsequent writes to the
+ // file. (Note that, as of this writing, Linux overlayfs does not do this;
+ // this feature is a gVisor extension.) lowerMappings is protected by
+ // mapsMu.
+ //
+ // - If this dentry is copied-up, then wrappedMappable is the Mappable
+ // obtained from a call to the current top layer's
+ // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
+ // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil.
+ // wrappedMappable is protected by mapsMu and dataMu.
+ //
+ // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
+ // accessed using atomic memory operations.
+ mapsMu sync.Mutex
+ lowerMappings memmap.MappingSet
+ dataMu sync.RWMutex
+ wrappedMappable memmap.Mappable
+ isMappable uint32
+
locks vfs.FileLocks
}
diff --git a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go
index 454862d98..9431c1506 100644
--- a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go
+++ b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go
@@ -1,10 +1,10 @@
package proc
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go
index d2169be5b..872b20eb0 100644
--- a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go
+++ b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go
@@ -1,10 +1,10 @@
package proc
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go
index 9b50f632c..c6d9b3522 100644
--- a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go
+++ b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go
@@ -1,10 +1,10 @@
package proc
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 5374538c9..3f0d78461 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -209,7 +209,7 @@ func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *ker
return d
}
-func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
file, _ := getTaskFD(s.task, s.fd)
if file == nil {
return "", syserror.ENOENT
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 4f7f9cb00..356036b9b 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -668,7 +668,7 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr
}
// Readlink implements kernfs.Inode.
-func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
if !kernel.ContextCanTrace(ctx, s.task, false) {
return "", syserror.EACCES
}
@@ -808,11 +808,11 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
}
// Readlink implements Inode.
-func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
if err := checkTaskState(s.task); err != nil {
return "", err
}
- return s.StaticSymlink.Readlink(ctx, mnt)
+ return s.StaticSymlink.Readlink(ctx)
}
// Getlink implements Inode.Getlink.
diff --git a/pkg/sentry/fsimpl/proc/task_inode_refs.go b/pkg/sentry/fsimpl/proc/task_inode_refs.go
index c29272f9b..714488450 100644
--- a/pkg/sentry/fsimpl/proc/task_inode_refs.go
+++ b/pkg/sentry/fsimpl/proc/task_inode_refs.go
@@ -1,10 +1,10 @@
package proc
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 68c541046..8c41729e4 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -51,7 +51,7 @@ func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns
return d
}
-func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -64,8 +64,8 @@ func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error
return strconv.FormatUint(uint64(tgid), 10), nil
}
-func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx, mnt)
+func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx)
return vfs.VirtualDentry{}, target, err
}
@@ -94,7 +94,7 @@ func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64,
return d
}
-func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -108,8 +108,8 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string,
return fmt.Sprintf("%d/task/%d", tgid, tid), nil
}
-func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx, mnt)
+func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx)
return vfs.VirtualDentry{}, target, err
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go
index 7e0b70f6c..22d9cc488 100644
--- a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go
+++ b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go
@@ -1,10 +1,10 @@
package proc
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/sys/dir_refs.go b/pkg/sentry/fsimpl/sys/dir_refs.go
index d42edb20e..89609b198 100644
--- a/pkg/sentry/fsimpl/sys/dir_refs.go
+++ b/pkg/sentry/fsimpl/sys/dir_refs.go
@@ -1,10 +1,10 @@
package sys
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)
diff --git a/pkg/sentry/fsimpl/tmpfs/inode_refs.go b/pkg/sentry/fsimpl/tmpfs/inode_refs.go
index 4f4037adb..dbf0b2766 100644
--- a/pkg/sentry/fsimpl/tmpfs/inode_refs.go
+++ b/pkg/sentry/fsimpl/tmpfs/inode_refs.go
@@ -1,10 +1,10 @@
package tmpfs
import (
+ "fmt"
"runtime"
"sync/atomic"
- "fmt"
"gvisor.dev/gvisor/pkg/log"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
)