diff options
-rw-r--r-- | pkg/abi/linux/fuse.go | 143 | ||||
-rw-r--r-- | pkg/abi/linux/linux_abi_autogen_unsafe.go | 587 | ||||
-rw-r--r-- | pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go | 6 | ||||
-rw-r--r-- | pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go | 8 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/fuse/connection.go | 255 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/fuse/dev.go | 289 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/fuse/fuse_state_autogen.go | 157 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/fuse/fusefs.go | 52 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/fuse/register.go | 42 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/fuse/request_list.go | 193 |
10 files changed, 1695 insertions, 37 deletions
diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go new file mode 100644 index 000000000..d3ebbccc4 --- /dev/null +++ b/pkg/abi/linux/fuse.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// +marshal +type FUSEOpcode uint32 + +// +marshal +type FUSEOpID uint64 + +// Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h. +const ( + FUSE_LOOKUP FUSEOpcode = 1 + FUSE_FORGET = 2 /* no reply */ + FUSE_GETATTR = 3 + FUSE_SETATTR = 4 + FUSE_READLINK = 5 + FUSE_SYMLINK = 6 + _ + FUSE_MKNOD = 8 + FUSE_MKDIR = 9 + FUSE_UNLINK = 10 + FUSE_RMDIR = 11 + FUSE_RENAME = 12 + FUSE_LINK = 13 + FUSE_OPEN = 14 + FUSE_READ = 15 + FUSE_WRITE = 16 + FUSE_STATFS = 17 + FUSE_RELEASE = 18 + _ + FUSE_FSYNC = 20 + FUSE_SETXATTR = 21 + FUSE_GETXATTR = 22 + FUSE_LISTXATTR = 23 + FUSE_REMOVEXATTR = 24 + FUSE_FLUSH = 25 + FUSE_INIT = 26 + FUSE_OPENDIR = 27 + FUSE_READDIR = 28 + FUSE_RELEASEDIR = 29 + FUSE_FSYNCDIR = 30 + FUSE_GETLK = 31 + FUSE_SETLK = 32 + FUSE_SETLKW = 33 + FUSE_ACCESS = 34 + FUSE_CREATE = 35 + FUSE_INTERRUPT = 36 + FUSE_BMAP = 37 + FUSE_DESTROY = 38 + FUSE_IOCTL = 39 + FUSE_POLL = 40 + FUSE_NOTIFY_REPLY = 41 + FUSE_BATCH_FORGET = 42 +) + +const ( + // FUSE_MIN_READ_BUFFER is the minimum size the read can be for any FUSE filesystem. + // This is the minimum size Linux supports. See linux.fuse.h. + FUSE_MIN_READ_BUFFER uint32 = 8192 +) + +// FUSEHeaderIn is the header read by the daemon with each request. +// +// +marshal +type FUSEHeaderIn struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Opcode specifies the kind of operation of the request. + Opcode FUSEOpcode + + // Unique specifies the unique identifier for this request. + Unique FUSEOpID + + // NodeID is the ID of the filesystem object being operated on. + NodeID uint64 + + // UID is the UID of the requesting process. + UID uint32 + + // GID is the GID of the requesting process. + GID uint32 + + // PID is the PID of the requesting process. + PID uint32 + + _ uint32 +} + +// FUSEHeaderOut is the header written by the daemon when it processes +// a request and wants to send a reply (almost all operations require a +// reply; if they do not, this will be explicitly documented). +// +// +marshal +type FUSEHeaderOut struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Error specifies the error that occurred (0 if none). + Error int32 + + // Unique specifies the unique identifier of the corresponding request. + Unique FUSEOpID +} + +// FUSEWriteIn is the header written by a daemon when it makes a +// write request to the FUSE filesystem. +// +// +marshal +type FUSEWriteIn struct { + // Fh specifies the file handle that is being written to. + Fh uint64 + + // Offset is the offset of the write. + Offset uint64 + + // Size is the size of data being written. + Size uint32 + + // WriteFlags is the flags used during the write. + WriteFlags uint32 + + // LockOwner is the ID of the lock owner. + LockOwner uint64 + + // Flags is the flags for the request. + Flags uint32 + + _ uint32 +} diff --git a/pkg/abi/linux/linux_abi_autogen_unsafe.go b/pkg/abi/linux/linux_abi_autogen_unsafe.go index 540981b56..414a8ec20 100644 --- a/pkg/abi/linux/linux_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_abi_autogen_unsafe.go @@ -15,6 +15,11 @@ import ( // Marshallable types used by this file. var _ marshal.Marshallable = (*ControlMessageCredentials)(nil) +var _ marshal.Marshallable = (*FUSEHeaderIn)(nil) +var _ marshal.Marshallable = (*FUSEHeaderOut)(nil) +var _ marshal.Marshallable = (*FUSEOpID)(nil) +var _ marshal.Marshallable = (*FUSEOpcode)(nil) +var _ marshal.Marshallable = (*FUSEWriteIn)(nil) var _ marshal.Marshallable = (*IFConf)(nil) var _ marshal.Marshallable = (*IFReq)(nil) var _ marshal.Marshallable = (*IPTEntry)(nil) @@ -142,7 +147,7 @@ func (s *Statx) Packed() bool { // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Statx) MarshalUnsafe(dst []byte) { - if s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() { + if s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() && s.Btime.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(s)) } else { // Type Statx doesn't have a packed layout in memory, fallback to MarshalBytes. @@ -385,6 +390,584 @@ func (s *Statfs) WriteTo(w io.Writer) (int64, error) { } // SizeBytes implements marshal.Marshallable.SizeBytes. +//go:nosplit +func (f *FUSEOpcode) SizeBytes() int { + return 4 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEOpcode) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(*f)) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEOpcode) UnmarshalBytes(src []byte) { + *f = FUSEOpcode(uint32(usermem.ByteOrder.Uint32(src[:4]))) +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEOpcode) Packed() bool { + // Scalar newtypes are always packed. + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEOpcode) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEOpcode) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEOpcode) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEOpcode) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEOpcode) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEOpcode) WriteTo(w io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := w.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +//go:nosplit +func (f *FUSEOpID) SizeBytes() int { + return 8 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEOpID) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(*f)) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEOpID) UnmarshalBytes(src []byte) { + *f = FUSEOpID(uint64(usermem.ByteOrder.Uint64(src[:8]))) +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEOpID) Packed() bool { + // Scalar newtypes are always packed. + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEOpID) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEOpID) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEOpID) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEOpID) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEOpID) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEOpID) WriteTo(w io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := w.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEHeaderIn) SizeBytes() int { + return 28 + + (*FUSEOpcode)(nil).SizeBytes() + + (*FUSEOpID)(nil).SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEHeaderIn) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Len)) + dst = dst[4:] + f.Opcode.MarshalBytes(dst[:f.Opcode.SizeBytes()]) + dst = dst[f.Opcode.SizeBytes():] + f.Unique.MarshalBytes(dst[:f.Unique.SizeBytes()]) + dst = dst[f.Unique.SizeBytes():] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.NodeID)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.UID)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.GID)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.PID)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEHeaderIn) UnmarshalBytes(src []byte) { + f.Len = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Opcode.UnmarshalBytes(src[:f.Opcode.SizeBytes()]) + src = src[f.Opcode.SizeBytes():] + f.Unique.UnmarshalBytes(src[:f.Unique.SizeBytes()]) + src = src[f.Unique.SizeBytes():] + f.NodeID = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.UID = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.GID = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.PID = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEHeaderIn) Packed() bool { + return f.Opcode.Packed() && f.Unique.Packed() +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEHeaderIn) MarshalUnsafe(dst []byte) { + if f.Opcode.Packed() && f.Unique.Packed() { + safecopy.CopyIn(dst, unsafe.Pointer(f)) + } else { + // Type FUSEHeaderIn doesn't have a packed layout in memory, fallback to MarshalBytes. + f.MarshalBytes(dst) + } +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEHeaderIn) UnmarshalUnsafe(src []byte) { + if f.Opcode.Packed() && f.Unique.Packed() { + safecopy.CopyOut(unsafe.Pointer(f), src) + } else { + // Type FUSEHeaderIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. + f.UnmarshalBytes(src) + } +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEHeaderIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + if !f.Opcode.Packed() && f.Unique.Packed() { + // Type FUSEHeaderIn doesn't have a packed layout in memory, fall back to MarshalBytes. + buf := task.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. + f.MarshalBytes(buf) // escapes: fallback. + return task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEHeaderIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEHeaderIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + if !f.Opcode.Packed() && f.Unique.Packed() { + // Type FUSEHeaderIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. + buf := task.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Unmarshal unconditionally. If we had a short copy-in, this results in a + // partially unmarshalled struct. + f.UnmarshalBytes(buf) // escapes: fallback. + return length, err + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEHeaderIn) WriteTo(w io.Writer) (int64, error) { + if !f.Opcode.Packed() && f.Unique.Packed() { + // Type FUSEHeaderIn doesn't have a packed layout in memory, fall back to MarshalBytes. + buf := make([]byte, f.SizeBytes()) + f.MarshalBytes(buf) + length, err := w.Write(buf) + return int64(length), err + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := w.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEHeaderOut) SizeBytes() int { + return 8 + + (*FUSEOpID)(nil).SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEHeaderOut) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Len)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Error)) + dst = dst[4:] + f.Unique.MarshalBytes(dst[:f.Unique.SizeBytes()]) + dst = dst[f.Unique.SizeBytes():] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEHeaderOut) UnmarshalBytes(src []byte) { + f.Len = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Error = int32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Unique.UnmarshalBytes(src[:f.Unique.SizeBytes()]) + src = src[f.Unique.SizeBytes():] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEHeaderOut) Packed() bool { + return f.Unique.Packed() +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEHeaderOut) MarshalUnsafe(dst []byte) { + if f.Unique.Packed() { + safecopy.CopyIn(dst, unsafe.Pointer(f)) + } else { + // Type FUSEHeaderOut doesn't have a packed layout in memory, fallback to MarshalBytes. + f.MarshalBytes(dst) + } +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEHeaderOut) UnmarshalUnsafe(src []byte) { + if f.Unique.Packed() { + safecopy.CopyOut(unsafe.Pointer(f), src) + } else { + // Type FUSEHeaderOut doesn't have a packed layout in memory, fallback to UnmarshalBytes. + f.UnmarshalBytes(src) + } +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEHeaderOut) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + if !f.Unique.Packed() { + // Type FUSEHeaderOut doesn't have a packed layout in memory, fall back to MarshalBytes. + buf := task.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. + f.MarshalBytes(buf) // escapes: fallback. + return task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEHeaderOut) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEHeaderOut) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + if !f.Unique.Packed() { + // Type FUSEHeaderOut doesn't have a packed layout in memory, fall back to UnmarshalBytes. + buf := task.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Unmarshal unconditionally. If we had a short copy-in, this results in a + // partially unmarshalled struct. + f.UnmarshalBytes(buf) // escapes: fallback. + return length, err + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEHeaderOut) WriteTo(w io.Writer) (int64, error) { + if !f.Unique.Packed() { + // Type FUSEHeaderOut doesn't have a packed layout in memory, fall back to MarshalBytes. + buf := make([]byte, f.SizeBytes()) + f.MarshalBytes(buf) + length, err := w.Write(buf) + return int64(length), err + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := w.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEWriteIn) SizeBytes() int { + return 40 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEWriteIn) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Offset)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.WriteFlags)) + dst = dst[4:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEWriteIn) UnmarshalBytes(src []byte) { + f.Fh = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Offset = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Size = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.WriteFlags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.LockOwner = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEWriteIn) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEWriteIn) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEWriteIn) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEWriteIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEWriteIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEWriteIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEWriteIn) WriteTo(w io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := w.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RobustListHead) SizeBytes() int { return 24 } @@ -818,7 +1401,7 @@ func (i *IPTEntry) Packed() bool { // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTEntry) MarshalUnsafe(dst []byte) { - if i.Counters.Packed() && i.IP.Packed() { + if i.IP.Packed() && i.Counters.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(i)) } else { // Type IPTEntry doesn't have a packed layout in memory, fallback to MarshalBytes. diff --git a/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go b/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go index 238af9fb4..42c9623af 100644 --- a/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go @@ -288,7 +288,7 @@ func (s *Stat) UnmarshalBytes(src []byte) { // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Stat) Packed() bool { - return s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() + return s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. @@ -344,7 +344,7 @@ func (s *Stat) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { // CopyIn implements marshal.Marshallable.CopyIn. //go:nosplit func (s *Stat) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - if !s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() { + if !s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := task.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := task.CopyInBytes(addr, buf) // escapes: okay. @@ -370,7 +370,7 @@ func (s *Stat) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (s *Stat) WriteTo(w io.Writer) (int64, error) { - if !s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() { + if !s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) diff --git a/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go b/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go index bdc129008..4d0ebca49 100644 --- a/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go @@ -300,7 +300,7 @@ func (s *Stat) Packed() bool { // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Stat) MarshalUnsafe(dst []byte) { - if s.CTime.Packed() && s.ATime.Packed() && s.MTime.Packed() { + if s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(s)) } else { // Type Stat doesn't have a packed layout in memory, fallback to MarshalBytes. @@ -321,7 +321,7 @@ func (s *Stat) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (s *Stat) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !s.CTime.Packed() && s.ATime.Packed() && s.MTime.Packed() { + if !s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. @@ -351,7 +351,7 @@ func (s *Stat) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { // CopyIn implements marshal.Marshallable.CopyIn. //go:nosplit func (s *Stat) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - if !s.CTime.Packed() && s.ATime.Packed() && s.MTime.Packed() { + if !s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := task.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := task.CopyInBytes(addr, buf) // escapes: okay. @@ -377,7 +377,7 @@ func (s *Stat) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (s *Stat) WriteTo(w io.Writer) (int64, error) { - if !s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() { + if !s.CTime.Packed() && s.ATime.Packed() && s.MTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go new file mode 100644 index 000000000..f330da0bd --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -0,0 +1,255 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "errors" + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" +) + +// MaxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const MaxActiveRequestsDefault = 10000 + +var ( + // Ordinary requests have even IDs, while interrupts IDs are odd. + InitReqBit uint64 = 1 + ReqIDStep uint64 = 2 +) + +// Request represents a FUSE operation request that hasn't been sent to the +// server yet. +// +// +stateify savable +type Request struct { + requestEntry + + id linux.FUSEOpID + hdr *linux.FUSEHeaderIn + data []byte +} + +// Response represents an actual response from the server, including the +// response payload. +// +// +stateify savable +type Response struct { + opcode linux.FUSEOpcode + hdr linux.FUSEHeaderOut + data []byte +} + +// Connection is the struct by which the sentry communicates with the FUSE server daemon. +type Connection struct { + fd *DeviceFD + + // MaxWrite is the daemon's maximum size of a write buffer. + // This is negotiated during FUSE_INIT. + MaxWrite uint32 +} + +// NewFUSEConnection creates a FUSE connection to fd +func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*Connection, error) { + // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to + // mount a FUSE filesystem. + fuseFD := fd.Impl().(*DeviceFD) + fuseFD.mounted = true + + // Create the writeBuf for the header to be stored in. + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + fuseFD.writeBuf = make([]byte, hdrLen) + fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) + fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) + fuseFD.writeCursor = 0 + + return &Connection{ + fd: fuseFD, + }, nil +} + +// NewRequest creates a new request that can be sent to the FUSE server. +func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + conn.fd.nextOpID += linux.FUSEOpID(ReqIDStep) + + hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() + hdr := linux.FUSEHeaderIn{ + Len: uint32(hdrLen + payload.SizeBytes()), + Opcode: opcode, + Unique: conn.fd.nextOpID, + NodeID: ino, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + PID: pid, + } + + buf := make([]byte, hdr.Len) + hdr.MarshalUnsafe(buf[:hdrLen]) + payload.MarshalUnsafe(buf[hdrLen:]) + + return &Request{ + id: hdr.Unique, + hdr: &hdr, + data: buf, + }, nil +} + +// Call makes a request to the server and blocks the invoking task until a +// server responds with a response. +// NOTE: If no task is provided then the Call will simply enqueue the request +// and return a nil response. No blocking will happen in this case. Instead, +// this is used to signify that the processing of this request will happen by +// the kernel.Task that writes the response. See FUSE_INIT for such an +// invocation. +func (conn *Connection) Call(t *kernel.Task, r *Request) (*Response, error) { + fut, err := conn.callFuture(t, r) + if err != nil { + return nil, err + } + + return fut.resolve(t) +} + +// Error returns the error of the FUSE call. +func (r *Response) Error() error { + errno := r.hdr.Error + if errno >= 0 { + return nil + } + + sysErrNo := syscall.Errno(-errno) + return error(sysErrNo) +} + +// UnmarshalPayload unmarshals the response data into m. +func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { + hdrLen := r.hdr.SizeBytes() + haveDataLen := r.hdr.Len - uint32(hdrLen) + wantDataLen := uint32(m.SizeBytes()) + + if haveDataLen < wantDataLen { + return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) + } + + m.UnmarshalUnsafe(r.data[hdrLen:]) + return nil +} + +// callFuture makes a request to the server and returns a future response. +// Call resolve() when the response needs to be fulfilled. +func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + + // Is the queue full? + // + // We must busy wait here until the request can be queued. We don't + // block on the fd.fullQueueCh with a lock - so after being signalled, + // before we acquire the lock, it is possible that a barging task enters + // and queues a request. As a result, upon acquiring the lock we must + // again check if the room is available. + // + // This can potentially starve a request forever but this can only happen + // if there are always too many ongoing requests all the time. The + // supported maxActiveRequests setting should be really high to avoid this. + for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + if t == nil { + // Since there is no task that is waiting. We must error out. + return nil, errors.New("FUSE request queue full") + } + + log.Infof("Blocking request %v from being queued. Too many active requests: %v", + r.id, conn.fd.numActiveRequests) + conn.fd.mu.Unlock() + err := t.Block(conn.fd.fullQueueCh) + conn.fd.mu.Lock() + if err != nil { + return nil, err + } + } + + return conn.callFutureLocked(t, r) +} + +// callFutureLocked makes a request to the server and returns a future response. +func (conn *Connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { + conn.fd.queue.PushBack(r) + conn.fd.numActiveRequests += 1 + fut := newFutureResponse(r.hdr.Opcode) + conn.fd.completions[r.id] = fut + + // Signal the readers that there is something to read. + conn.fd.waitQueue.Notify(waiter.EventIn) + + return fut, nil +} + +// futureResponse represents an in-flight request, that may or may not have +// completed yet. Convert it to a resolved Response by calling Resolve, but note +// that this may block. +// +// +stateify savable +type futureResponse struct { + opcode linux.FUSEOpcode + ch chan struct{} + hdr *linux.FUSEHeaderOut + data []byte +} + +// newFutureResponse creates a future response to a FUSE request. +func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse { + return &futureResponse{ + opcode: opcode, + ch: make(chan struct{}), + } +} + +// resolve blocks the task until the server responds to its corresponding request, +// then returns a resolved response. +func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { + // If there is no Task associated with this request - then we don't try to resolve + // the response. Instead, the task writing the response (proxy to the server) will + // process the response on our behalf. + if t == nil { + log.Infof("fuse.Response.resolve: Not waiting on a response from server.") + return nil, nil + } + + if err := t.Block(f.ch); err != nil { + return nil, err + } + + return f.getResponse(), nil +} + +// getResponse creates a Response from the data the futureResponse has. +func (f *futureResponse) getResponse() *Response { + return &Response{ + opcode: f.opcode, + hdr: *f.hdr, + data: f.data, + } +} diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index c9e12a94f..f3443ac71 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -15,13 +15,17 @@ package fuse import ( + "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) const fuseDevMinor = 229 @@ -54,9 +58,43 @@ type DeviceFD struct { // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. mounted bool - // TODO(gvisor.dev/issue/2987): Add all the data structures needed to enqueue - // and deque requests, control synchronization and establish communication - // between the FUSE kernel module and the /dev/fuse character device. + // nextOpID is used to create new requests. + nextOpID linux.FUSEOpID + + // queue is the list of requests that need to be processed by the FUSE server. + queue requestList + + // numActiveRequests is the number of requests made by the Sentry that has + // yet to be responded to. + numActiveRequests uint64 + + // completions is used to map a request to its response. A Writer will use this + // to notify the caller of a completed response. + completions map[linux.FUSEOpID]*futureResponse + + writeCursor uint32 + + // writeBuf is the memory buffer used to copy in the FUSE out header from + // userspace. + writeBuf []byte + + // writeCursorFR current FR being copied from server. + writeCursorFR *futureResponse + + // mu protects all the queues, maps, buffers and cursors and nextOpID. + mu sync.Mutex + + // waitQueue is used to notify interested parties when the device becomes + // readable or writable. + waitQueue waiter.Queue + + // fullQueueCh is a channel used to synchronize the readers with the writers. + // Writers (inbound requests to the filesystem) block if there are too many + // unprocessed in-flight requests. + fullQueueCh chan struct{} + + // fs is the FUSE filesystem that this FD is being used for. + fs *filesystem } // Release implements vfs.FileDescriptionImpl.Release. @@ -79,7 +117,75 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R return 0, syserror.EPERM } - return 0, syserror.ENOSYS + // We require that any Read done on this filesystem have a sane minimum + // read buffer. It must have the capacity for the fixed parts of any request + // header (Linux uses the request header and the FUSEWriteIn header for this + // calculation) + the negotiated MaxWrite room for the data. + minBuffSize := linux.FUSE_MIN_READ_BUFFER + inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) + negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.MaxWrite + if minBuffSize < negotiatedMinBuffSize { + minBuffSize = negotiatedMinBuffSize + } + + // If the read buffer is too small, error out. + if dst.NumBytes() < int64(minBuffSize) { + return 0, syserror.EINVAL + } + + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readLocked(ctx, dst, opts) +} + +// readLocked implements the reading of the fuse device while locked with DeviceFD.mu. +func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if fd.queue.Empty() { + return 0, syserror.ErrWouldBlock + } + + var readCursor uint32 + var bytesRead int64 + for { + req := fd.queue.Front() + if dst.NumBytes() < int64(req.hdr.Len) { + // The request is too large. Cannot process it. All requests must be smaller than the + // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT + // handshake. + errno := -int32(syscall.EIO) + if req.hdr.Opcode == linux.FUSE_SETXATTR { + errno = -int32(syscall.E2BIG) + } + + // Return the error to the calling task. + if err := fd.sendError(ctx, errno, req); err != nil { + return 0, err + } + + // We're done with this request. + fd.queue.Remove(req) + + // Restart the read as this request was invalid. + log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.") + return fd.readLocked(ctx, dst, opts) + } + + n, err := dst.CopyOut(ctx, req.data[readCursor:]) + if err != nil { + return 0, err + } + readCursor += uint32(n) + bytesRead += int64(n) + + if readCursor >= req.hdr.Len { + // Fully done with this req, remove it from the queue. + fd.queue.Remove(req) + break + } + } + + return bytesRead, nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. @@ -94,12 +200,128 @@ func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset i // Write implements vfs.FileDescriptionImpl.Write. func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.writeLocked(ctx, src, opts) +} + +// writeLocked implements writing to the fuse device while locked with DeviceFD.mu. +func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if !fd.mounted { return 0, syserror.EPERM } - return 0, syserror.ENOSYS + var cn, n int64 + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + + for src.NumBytes() > 0 { + if fd.writeCursorFR != nil { + // Already have common header, and we're now copying the payload. + wantBytes := fd.writeCursorFR.hdr.Len + + // Note that the FR data doesn't have the header. Copy it over if its necessary. + if fd.writeCursorFR.data == nil { + fd.writeCursorFR.data = make([]byte, wantBytes) + } + + bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == wantBytes { + // Done reading this full response. Clean up and unblock the + // initiator. + break + } + + // Check if we have more data in src. + continue + } + + // Assert that the header isn't read into the writeBuf yet. + if fd.writeCursor >= hdrLen { + return 0, syserror.EINVAL + } + + // We don't have the full common response header yet. + wantBytes := hdrLen - fd.writeCursor + bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == hdrLen { + // Have full header in the writeBuf. Use it to fetch the actual futureResponse + // from the device's completions map. + var hdr linux.FUSEHeaderOut + hdr.UnmarshalBytes(fd.writeBuf) + + // We have the header now and so the writeBuf has served its purpose. + // We could reset it manually here but instead of doing that, at the + // end of the write, the writeCursor will be set to 0 thereby allowing + // the next request to overwrite whats in the buffer, + + fut, ok := fd.completions[hdr.Unique] + if !ok { + // Server sent us a response for a request we never sent? + return 0, syserror.EINVAL + } + + delete(fd.completions, hdr.Unique) + + // Copy over the header into the future response. The rest of the payload + // will be copied over to the FR's data in the next iteration. + fut.hdr = &hdr + fd.writeCursorFR = fut + + // Next iteration will now try read the complete request, if src has + // any data remaining. Otherwise we're done. + } + } + + if fd.writeCursorFR != nil { + if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil { + return 0, err + } + + // Ready the device for the next request. + fd.writeCursorFR = nil + fd.writeCursor = 0 + } + + return n, nil +} + +// Readiness implements vfs.FileDescriptionImpl.Readiness. +func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + ready |= waiter.EventOut // FD is always writable + if !fd.queue.Empty() { + // Have reqs available, FD is readable. + ready |= waiter.EventIn + } + + return ready & mask +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.waitQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { + fd.waitQueue.EventUnregister(e) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -112,22 +334,61 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 return 0, syserror.ENOSYS } -// Register registers the FUSE device with vfsObj. -func Register(vfsObj *vfs.VirtualFilesystem) error { - if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ - GroupName: "misc", - }); err != nil { +// sendResponse sends a response to the waiting task (if any). +func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { + // See if the running task need to perform some action before returning. + // Since we just finished writing the future, we can be sure that + // getResponse generates a populated response. + if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil { return err } + // Signal that the queue is no longer full. + select { + case fd.fullQueueCh <- struct{}{}: + default: + } + fd.numActiveRequests -= 1 + + // Signal the task waiting on a response. + close(fut.ch) return nil } -// CreateDevtmpfsFile creates a device special file in devtmpfs. -func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { - if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { +// sendError sends an error response to the waiting task (if any). +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error { + // Return the error to the calling task. + outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + respHdr := linux.FUSEHeaderOut{ + Len: outHdrLen, + Error: errno, + Unique: req.hdr.Unique, + } + + fut, ok := fd.completions[respHdr.Unique] + if !ok { + // Server sent us a response for a request we never sent? + return syserror.EINVAL + } + delete(fd.completions, respHdr.Unique) + + fut.hdr = &respHdr + if err := fd.sendResponse(ctx, fut); err != nil { return err } return nil } + +// noReceiverAction has the calling kernel.Task do some action if its known that no +// receiver is going to be waiting on the future channel. This is to be used by: +// FUSE_INIT. +func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { + if r.opcode == linux.FUSE_INIT { + // TODO: process init response here. + // Maybe get the creds from the context? + // creds := auth.CredentialsFromContext(ctx) + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go index 2b9c882fb..e4ce04322 100644 --- a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go +++ b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go @@ -1,3 +1,160 @@ // automatically generated by stateify. package fuse + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (x *Request) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.Request" +} + +func (x *Request) StateFields() []string { + return []string{ + "requestEntry", + "id", + "hdr", + "data", + } +} + +func (x *Request) beforeSave() {} + +func (x *Request) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.requestEntry) + m.Save(1, &x.id) + m.Save(2, &x.hdr) + m.Save(3, &x.data) +} + +func (x *Request) afterLoad() {} + +func (x *Request) StateLoad(m state.Source) { + m.Load(0, &x.requestEntry) + m.Load(1, &x.id) + m.Load(2, &x.hdr) + m.Load(3, &x.data) +} + +func (x *Response) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.Response" +} + +func (x *Response) StateFields() []string { + return []string{ + "opcode", + "hdr", + "data", + } +} + +func (x *Response) beforeSave() {} + +func (x *Response) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.opcode) + m.Save(1, &x.hdr) + m.Save(2, &x.data) +} + +func (x *Response) afterLoad() {} + +func (x *Response) StateLoad(m state.Source) { + m.Load(0, &x.opcode) + m.Load(1, &x.hdr) + m.Load(2, &x.data) +} + +func (x *futureResponse) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.futureResponse" +} + +func (x *futureResponse) StateFields() []string { + return []string{ + "opcode", + "ch", + "hdr", + "data", + } +} + +func (x *futureResponse) beforeSave() {} + +func (x *futureResponse) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.opcode) + m.Save(1, &x.ch) + m.Save(2, &x.hdr) + m.Save(3, &x.data) +} + +func (x *futureResponse) afterLoad() {} + +func (x *futureResponse) StateLoad(m state.Source) { + m.Load(0, &x.opcode) + m.Load(1, &x.ch) + m.Load(2, &x.hdr) + m.Load(3, &x.data) +} + +func (x *requestList) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.requestList" +} + +func (x *requestList) StateFields() []string { + return []string{ + "head", + "tail", + } +} + +func (x *requestList) beforeSave() {} + +func (x *requestList) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.head) + m.Save(1, &x.tail) +} + +func (x *requestList) afterLoad() {} + +func (x *requestList) StateLoad(m state.Source) { + m.Load(0, &x.head) + m.Load(1, &x.tail) +} + +func (x *requestEntry) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.requestEntry" +} + +func (x *requestEntry) StateFields() []string { + return []string{ + "next", + "prev", + } +} + +func (x *requestEntry) beforeSave() {} + +func (x *requestEntry) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.next) + m.Save(1, &x.prev) +} + +func (x *requestEntry) afterLoad() {} + +func (x *requestEntry) StateLoad(m state.Source) { + m.Load(0, &x.next) + m.Load(1, &x.prev) +} + +func init() { + state.Register((*Request)(nil)) + state.Register((*Response)(nil)) + state.Register((*futureResponse)(nil)) + state.Register((*requestList)(nil)) + state.Register((*requestEntry)(nil)) +} diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index f7775fb9b..911b6f7cb 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -51,6 +51,11 @@ type filesystemOptions struct { // rootMode specifies the the file mode of the filesystem's root. rootMode linux.FileMode + + // maxActiveRequests specifies the maximum number of active requests that can + // exist at any time. Any further requests will block when trying to + // Call the server. + maxActiveRequests uint64 } // filesystem implements vfs.FilesystemImpl. @@ -58,12 +63,12 @@ type filesystem struct { kernfs.Filesystem devMinor uint32 - // fuseFD is the FD returned when opening /dev/fuse. It is used for communication - // between the FUSE server daemon and the sentry fusefs. - fuseFD *DeviceFD + // conn is used for communication between the FUSE server + // daemon and the sentry fusefs. + conn *Connection // opts is the options the fusefs is initialized with. - opts filesystemOptions + opts *filesystemOptions } // Name implements vfs.FilesystemType.Name. @@ -100,7 +105,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor)) // Parse and set all the other supported FUSE mount options. - // TODO: Expand the supported mount options. + // TODO(gVisor.dev/issue/3229): Expand the supported mount options. if userIDStr, ok := mopts["user_id"]; ok { delete(mopts, "user_id") userID, err := strconv.ParseUint(userIDStr, 10, 32) @@ -134,21 +139,20 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fsopts.rootMode = rootMode + // Set the maxInFlightRequests option. + fsopts.maxActiveRequests = MaxActiveRequestsDefault + // Check for unparsed options. if len(mopts) != 0 { log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) return nil, nil, syserror.EINVAL } - // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to - // mount a FUSE filesystem. - fuseFD := fuseFd.Impl().(*DeviceFD) - fuseFD.mounted = true - - fs := &filesystem{ - devMinor: devMinor, - fuseFD: fuseFD, - opts: fsopts, + // Create a new FUSE filesystem. + fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + if err != nil { + log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) + return nil, nil, err } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) @@ -162,6 +166,26 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), root.VFSDentry(), nil } +// NewFUSEFilesystem creates a new FUSE filesystem. +func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { + fs := &filesystem{ + devMinor: devMinor, + opts: opts, + } + + conn, err := NewFUSEConnection(ctx, device, opts.maxActiveRequests) + if err != nil { + log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) + return nil, syserror.EINVAL + } + + fs.conn = conn + fuseFD := device.Impl().(*DeviceFD) + fuseFD.fs = fs + + return fs, nil +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go new file mode 100644 index 000000000..b5b581152 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/register.go @@ -0,0 +1,42 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Register registers the FUSE device with vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ + GroupName: "misc", + }); err != nil { + return err + } + + return nil +} + +// CreateDevtmpfsFile creates a device special file in devtmpfs. +func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { + if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { + return err + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/request_list.go b/pkg/sentry/fsimpl/fuse/request_list.go new file mode 100644 index 000000000..002262f23 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/request_list.go @@ -0,0 +1,193 @@ +package fuse + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type requestElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (requestElementMapper) linkerFor(elem *Request) *Request { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type requestList struct { + head *Request + tail *Request +} + +// Reset resets list l to the empty state. +func (l *requestList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *requestList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *requestList) Front() *Request { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *requestList) Back() *Request { + return l.tail +} + +// Len returns the number of elements in the list. +// +// NOTE: This is an O(n) operation. +func (l *requestList) Len() (count int) { + for e := l.Front(); e != nil; e = (requestElementMapper{}.linkerFor(e)).Next() { + count++ + } + return count +} + +// PushFront inserts the element e at the front of list l. +func (l *requestList) PushFront(e *Request) { + linker := requestElementMapper{}.linkerFor(e) + linker.SetNext(l.head) + linker.SetPrev(nil) + if l.head != nil { + requestElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *requestList) PushBack(e *Request) { + linker := requestElementMapper{}.linkerFor(e) + linker.SetNext(nil) + linker.SetPrev(l.tail) + if l.tail != nil { + requestElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *requestList) PushBackList(m *requestList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + requestElementMapper{}.linkerFor(l.tail).SetNext(m.head) + requestElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *requestList) InsertAfter(b, e *Request) { + bLinker := requestElementMapper{}.linkerFor(b) + eLinker := requestElementMapper{}.linkerFor(e) + + a := bLinker.Next() + + eLinker.SetNext(a) + eLinker.SetPrev(b) + bLinker.SetNext(e) + + if a != nil { + requestElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *requestList) InsertBefore(a, e *Request) { + aLinker := requestElementMapper{}.linkerFor(a) + eLinker := requestElementMapper{}.linkerFor(e) + + b := aLinker.Prev() + eLinker.SetNext(a) + eLinker.SetPrev(b) + aLinker.SetPrev(e) + + if b != nil { + requestElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *requestList) Remove(e *Request) { + linker := requestElementMapper{}.linkerFor(e) + prev := linker.Prev() + next := linker.Next() + + if prev != nil { + requestElementMapper{}.linkerFor(prev).SetNext(next) + } else if l.head == e { + l.head = next + } + + if next != nil { + requestElementMapper{}.linkerFor(next).SetPrev(prev) + } else if l.tail == e { + l.tail = prev + } + + linker.SetNext(nil) + linker.SetPrev(nil) +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type requestEntry struct { + next *Request + prev *Request +} + +// Next returns the entry that follows e in the list. +func (e *requestEntry) Next() *Request { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *requestEntry) Prev() *Request { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *requestEntry) SetNext(elem *Request) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *requestEntry) SetPrev(elem *Request) { + e.prev = elem +} |