diff options
author | gVisor bot <gvisor-bot@google.com> | 2020-09-11 20:38:26 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2020-09-11 20:38:26 +0000 |
commit | 51194daa8da2bee362c9ecb6491016038e5f8436 (patch) | |
tree | aa1d6310ea8b331f6985e01ae8d60b722a3e9972 /pkg | |
parent | b758e31a9b2f0f68d0c0182a74da954f81053f72 (diff) | |
parent | b8bee78d0c99a0c9423061b3dda10b9ef8c9719d (diff) |
Merge release-20200810.0-236-gb8bee78d0 (automated)
Diffstat (limited to 'pkg')
61 files changed, 4088 insertions, 1059 deletions
diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go index 7e30483ee..fdd22a13d 100644 --- a/pkg/abi/linux/fuse.go +++ b/pkg/abi/linux/fuse.go @@ -14,12 +14,20 @@ package linux +import ( + "gvisor.dev/gvisor/tools/go_marshal/marshal" + "gvisor.dev/gvisor/tools/go_marshal/primitive" +) + // +marshal type FUSEOpcode uint32 // +marshal type FUSEOpID uint64 +// FUSE_ROOT_ID is the id of root inode. +const FUSE_ROOT_ID = 1 + // Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h. const ( FUSE_LOOKUP FUSEOpcode = 1 @@ -116,61 +124,28 @@ type FUSEHeaderOut struct { Unique FUSEOpID } -// FUSEWriteIn is the header written by a daemon when it makes a -// write request to the FUSE filesystem. -// -// +marshal -type FUSEWriteIn struct { - // Fh specifies the file handle that is being written to. - Fh uint64 - - // Offset is the offset of the write. - Offset uint64 - - // Size is the size of data being written. - Size uint32 - - // WriteFlags is the flags used during the write. - WriteFlags uint32 - - // LockOwner is the ID of the lock owner. - LockOwner uint64 - - // Flags is the flags for the request. - Flags uint32 - - _ uint32 -} - // FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h. +// Our taget version is 7.23 but we have few implemented in advance. const ( - FUSE_ASYNC_READ = 1 << 0 - FUSE_POSIX_LOCKS = 1 << 1 - FUSE_FILE_OPS = 1 << 2 - FUSE_ATOMIC_O_TRUNC = 1 << 3 - FUSE_EXPORT_SUPPORT = 1 << 4 - FUSE_BIG_WRITES = 1 << 5 - FUSE_DONT_MASK = 1 << 6 - FUSE_SPLICE_WRITE = 1 << 7 - FUSE_SPLICE_MOVE = 1 << 8 - FUSE_SPLICE_READ = 1 << 9 - FUSE_FLOCK_LOCKS = 1 << 10 - FUSE_HAS_IOCTL_DIR = 1 << 11 - FUSE_AUTO_INVAL_DATA = 1 << 12 - FUSE_DO_READDIRPLUS = 1 << 13 - FUSE_READDIRPLUS_AUTO = 1 << 14 - FUSE_ASYNC_DIO = 1 << 15 - FUSE_WRITEBACK_CACHE = 1 << 16 - FUSE_NO_OPEN_SUPPORT = 1 << 17 - FUSE_PARALLEL_DIROPS = 1 << 18 - FUSE_HANDLE_KILLPRIV = 1 << 19 - FUSE_POSIX_ACL = 1 << 20 - FUSE_ABORT_ERROR = 1 << 21 - FUSE_MAX_PAGES = 1 << 22 - FUSE_CACHE_SYMLINKS = 1 << 23 - FUSE_NO_OPENDIR_SUPPORT = 1 << 24 - FUSE_EXPLICIT_INVAL_DATA = 1 << 25 - FUSE_MAP_ALIGNMENT = 1 << 26 + FUSE_ASYNC_READ = 1 << 0 + FUSE_POSIX_LOCKS = 1 << 1 + FUSE_FILE_OPS = 1 << 2 + FUSE_ATOMIC_O_TRUNC = 1 << 3 + FUSE_EXPORT_SUPPORT = 1 << 4 + FUSE_BIG_WRITES = 1 << 5 + FUSE_DONT_MASK = 1 << 6 + FUSE_SPLICE_WRITE = 1 << 7 + FUSE_SPLICE_MOVE = 1 << 8 + FUSE_SPLICE_READ = 1 << 9 + FUSE_FLOCK_LOCKS = 1 << 10 + FUSE_HAS_IOCTL_DIR = 1 << 11 + FUSE_AUTO_INVAL_DATA = 1 << 12 + FUSE_DO_READDIRPLUS = 1 << 13 + FUSE_READDIRPLUS_AUTO = 1 << 14 + FUSE_ASYNC_DIO = 1 << 15 + FUSE_WRITEBACK_CACHE = 1 << 16 + FUSE_NO_OPEN_SUPPORT = 1 << 17 + FUSE_MAX_PAGES = 1 << 22 // From FUSE 7.28 ) // currently supported FUSE protocol version numbers. @@ -179,6 +154,13 @@ const ( FUSE_KERNEL_MINOR_VERSION = 31 ) +// Constants relevant to FUSE operations. +const ( + FUSE_NAME_MAX = 1024 + FUSE_PAGE_SIZE = 4096 + FUSE_DIRENT_ALIGN = 8 +) + // FUSEInitIn is the request sent by the kernel to the daemon, // to negotiate the version and flags. // @@ -199,7 +181,7 @@ type FUSEInitIn struct { } // FUSEInitOut is the reply sent by the daemon to the kernel -// for FUSEInitIn. +// for FUSEInitIn. We target FUSE 7.23; this struct supports 7.28. // // +marshal type FUSEInitOut struct { @@ -240,13 +222,16 @@ type FUSEInitOut struct { // if the value from daemon is too large. MaxPages uint16 - // MapAlignment is an unknown field and not used by this package at this moment. - // Use as a placeholder to be consistent with the FUSE protocol. - MapAlignment uint16 + _ uint16 _ [8]uint32 } +// FUSE_GETATTR_FH is currently the only flag of FUSEGetAttrIn.GetAttrFlags. +// If it is set, the file handle (FUSEGetAttrIn.Fh) is used to indicate the +// object instead of the node id attribute in the request header. +const FUSE_GETATTR_FH = (1 << 0) + // FUSEGetAttrIn is the request sent by the kernel to the daemon, // to get the attribute of a inode. // @@ -267,22 +252,52 @@ type FUSEGetAttrIn struct { // // +marshal type FUSEAttr struct { - Ino uint64 - Size uint64 - Blocks uint64 - Atime uint64 - Mtime uint64 - Ctime uint64 + // Ino is the inode number of this file. + Ino uint64 + + // Size is the size of this file. + Size uint64 + + // Blocks is the number of the 512B blocks allocated by this file. + Blocks uint64 + + // Atime is the time of last access. + Atime uint64 + + // Mtime is the time of last modification. + Mtime uint64 + + // Ctime is the time of last status change. + Ctime uint64 + + // AtimeNsec is the nano second part of Atime. AtimeNsec uint32 + + // MtimeNsec is the nano second part of Mtime. MtimeNsec uint32 + + // CtimeNsec is the nano second part of Ctime. CtimeNsec uint32 - Mode uint32 - Nlink uint32 - UID uint32 - GID uint32 - Rdev uint32 - BlkSize uint32 - _ uint32 + + // Mode contains the file type and mode. + Mode uint32 + + // Nlink is the number of the hard links. + Nlink uint32 + + // UID is user ID of the owner. + UID uint32 + + // GID is group ID of the owner. + GID uint32 + + // Rdev is the device ID if this is a special file. + Rdev uint32 + + // BlkSize is the block size for filesystem I/O. + BlkSize uint32 + + _ uint32 } // FUSEGetAttrOut is the reply sent by the daemon to the kernel @@ -301,3 +316,556 @@ type FUSEGetAttrOut struct { // Attr contains the metadata returned from the FUSE server Attr FUSEAttr } + +// FUSEEntryOut is the reply sent by the daemon to the kernel +// for FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and +// FUSE_LOOKUP. +// +// +marshal +type FUSEEntryOut struct { + // NodeID is the ID for current inode. + NodeID uint64 + + // Generation is the generation number of inode. + // Used to identify an inode that have different ID at different time. + Generation uint64 + + // EntryValid indicates timeout for an entry. + EntryValid uint64 + + // AttrValid indicates timeout for an entry's attributes. + AttrValid uint64 + + // EntryValidNsec indicates timeout for an entry in nanosecond. + EntryValidNSec uint32 + + // AttrValidNsec indicates timeout for an entry's attributes in nanosecond. + AttrValidNSec uint32 + + // Attr contains the attributes of an entry. + Attr FUSEAttr +} + +// FUSELookupIn is the request sent by the kernel to the daemon +// to look up a file name. +// +// Dynamically-sized objects cannot be marshalled. +type FUSELookupIn struct { + marshal.StubMarshallable + + // Name is a file name to be looked up. + Name string +} + +// MarshalBytes serializes r.name to the dst buffer. +func (r *FUSELookupIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) +} + +// SizeBytes is the size of the memory representation of FUSELookupIn. +// 1 extra byte for null-terminated string. +func (r *FUSELookupIn) SizeBytes() int { + return len(r.Name) + 1 +} + +// MAX_NON_LFS indicates the maximum offset without large file support. +const MAX_NON_LFS = ((1 << 31) - 1) + +// flags returned by OPEN request. +const ( + // FOPEN_DIRECT_IO indicates bypassing page cache for this opened file. + FOPEN_DIRECT_IO = 1 << 0 + // FOPEN_KEEP_CACHE avoids invalidate of data cache on open. + FOPEN_KEEP_CACHE = 1 << 1 + // FOPEN_NONSEEKABLE indicates the file cannot be seeked. + FOPEN_NONSEEKABLE = 1 << 2 +) + +// FUSEOpenIn is the request sent by the kernel to the daemon, +// to negotiate flags and get file handle. +// +// +marshal +type FUSEOpenIn struct { + // Flags of this open request. + Flags uint32 + + _ uint32 +} + +// FUSEOpenOut is the reply sent by the daemon to the kernel +// for FUSEOpenIn. +// +// +marshal +type FUSEOpenOut struct { + // Fh is the file handler for opened file. + Fh uint64 + + // OpenFlag for the opened file. + OpenFlag uint32 +} + +// FUSE_READ flags, consistent with the ones in include/uapi/linux/fuse.h. +const ( + FUSE_READ_LOCKOWNER = 1 << 1 +) + +// FUSEReadIn is the request sent by the kernel to the daemon +// for FUSE_READ. +// +// +marshal +type FUSEReadIn struct { + // Fh is the file handle in userspace. + Fh uint64 + + // Offset is the read offset. + Offset uint64 + + // Size is the number of bytes to read. + Size uint32 + + // ReadFlags for this FUSE_READ request. + // Currently only contains FUSE_READ_LOCKOWNER. + ReadFlags uint32 + + // LockOwner is the id of the lock owner if there is one. + LockOwner uint64 + + // Flags for the underlying file. + Flags uint32 + + _ uint32 +} + +// FUSEWriteIn is the first part of the payload of the +// request sent by the kernel to the daemon +// for FUSE_WRITE (struct for FUSE version >= 7.9). +// +// The second part of the payload is the +// binary bytes of the data to be written. +// +// +marshal +type FUSEWriteIn struct { + // Fh is the file handle in userspace. + Fh uint64 + + // Offset is the write offset. + Offset uint64 + + // Size is the number of bytes to write. + Size uint32 + + // ReadFlags for this FUSE_WRITE request. + WriteFlags uint32 + + // LockOwner is the id of the lock owner if there is one. + LockOwner uint64 + + // Flags for the underlying file. + Flags uint32 + + _ uint32 +} + +// FUSEWriteOut is the payload of the reply sent by the daemon to the kernel +// for a FUSE_WRITE request. +// +// +marshal +type FUSEWriteOut struct { + // Size is the number of bytes written. + Size uint32 + + _ uint32 +} + +// FUSEReleaseIn is the request sent by the kernel to the daemon +// when there is no more reference to a file. +// +// +marshal +type FUSEReleaseIn struct { + // Fh is the file handler for the file to be released. + Fh uint64 + + // Flags of the file. + Flags uint32 + + // ReleaseFlags of this release request. + ReleaseFlags uint32 + + // LockOwner is the id of the lock owner if there is one. + LockOwner uint64 +} + +// FUSECreateMeta contains all the static fields of FUSECreateIn, +// which is used for FUSE_CREATE. +// +// +marshal +type FUSECreateMeta struct { + // Flags of the creating file. + Flags uint32 + + // Mode is the mode of the creating file. + Mode uint32 + + // Umask is the current file mode creation mask. + Umask uint32 + _ uint32 +} + +// FUSECreateIn contains all the arguments sent by the kernel to the daemon, to +// atomically create and open a new regular file. +// +// Dynamically-sized objects cannot be marshalled. +type FUSECreateIn struct { + marshal.StubMarshallable + + // CreateMeta contains mode, rdev and umash field for FUSE_MKNODS. + CreateMeta FUSECreateMeta + + // Name is the name of the node to create. + Name string +} + +// MarshalBytes serializes r.CreateMeta and r.Name to the dst buffer. +func (r *FUSECreateIn) MarshalBytes(buf []byte) { + r.CreateMeta.MarshalBytes(buf[:r.CreateMeta.SizeBytes()]) + copy(buf[r.CreateMeta.SizeBytes():], r.Name) +} + +// SizeBytes is the size of the memory representation of FUSECreateIn. +// 1 extra byte for null-terminated string. +func (r *FUSECreateIn) SizeBytes() int { + return r.CreateMeta.SizeBytes() + len(r.Name) + 1 +} + +// FUSEMknodMeta contains all the static fields of FUSEMknodIn, +// which is used for FUSE_MKNOD. +// +// +marshal +type FUSEMknodMeta struct { + // Mode of the inode to create. + Mode uint32 + + // Rdev encodes device major and minor information. + Rdev uint32 + + // Umask is the current file mode creation mask. + Umask uint32 + + _ uint32 +} + +// FUSEMknodIn contains all the arguments sent by the kernel +// to the daemon, to create a new file node. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEMknodIn struct { + marshal.StubMarshallable + + // MknodMeta contains mode, rdev and umash field for FUSE_MKNODS. + MknodMeta FUSEMknodMeta + + // Name is the name of the node to create. + Name string +} + +// MarshalBytes serializes r.MknodMeta and r.Name to the dst buffer. +func (r *FUSEMknodIn) MarshalBytes(buf []byte) { + r.MknodMeta.MarshalBytes(buf[:r.MknodMeta.SizeBytes()]) + copy(buf[r.MknodMeta.SizeBytes():], r.Name) +} + +// SizeBytes is the size of the memory representation of FUSEMknodIn. +// 1 extra byte for null-terminated string. +func (r *FUSEMknodIn) SizeBytes() int { + return r.MknodMeta.SizeBytes() + len(r.Name) + 1 +} + +// FUSESymLinkIn is the request sent by the kernel to the daemon, +// to create a symbolic link. +// +// Dynamically-sized objects cannot be marshalled. +type FUSESymLinkIn struct { + marshal.StubMarshallable + + // Name of symlink to create. + Name string + + // Target of the symlink. + Target string +} + +// MarshalBytes serializes r.Name and r.Target to the dst buffer. +// Left null-termination at end of r.Name and r.Target. +func (r *FUSESymLinkIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) + copy(buf[len(r.Name)+1:], r.Target) +} + +// SizeBytes is the size of the memory representation of FUSESymLinkIn. +// 2 extra bytes for null-terminated string. +func (r *FUSESymLinkIn) SizeBytes() int { + return len(r.Name) + len(r.Target) + 2 +} + +// FUSEEmptyIn is used by operations without request body. +type FUSEEmptyIn struct{ marshal.StubMarshallable } + +// MarshalBytes do nothing for marshal. +func (r *FUSEEmptyIn) MarshalBytes(buf []byte) {} + +// SizeBytes is 0 for empty request. +func (r *FUSEEmptyIn) SizeBytes() int { + return 0 +} + +// FUSEMkdirMeta contains all the static fields of FUSEMkdirIn, +// which is used for FUSE_MKDIR. +// +// +marshal +type FUSEMkdirMeta struct { + // Mode of the directory of create. + Mode uint32 + + // Umask is the user file creation mask. + Umask uint32 +} + +// FUSEMkdirIn contains all the arguments sent by the kernel +// to the daemon, to create a new directory. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEMkdirIn struct { + marshal.StubMarshallable + + // MkdirMeta contains Mode and Umask of the directory to create. + MkdirMeta FUSEMkdirMeta + + // Name of the directory to create. + Name string +} + +// MarshalBytes serializes r.MkdirMeta and r.Name to the dst buffer. +func (r *FUSEMkdirIn) MarshalBytes(buf []byte) { + r.MkdirMeta.MarshalBytes(buf[:r.MkdirMeta.SizeBytes()]) + copy(buf[r.MkdirMeta.SizeBytes():], r.Name) +} + +// SizeBytes is the size of the memory representation of FUSEMkdirIn. +// 1 extra byte for null-terminated Name string. +func (r *FUSEMkdirIn) SizeBytes() int { + return r.MkdirMeta.SizeBytes() + len(r.Name) + 1 +} + +// FUSERmDirIn is the request sent by the kernel to the daemon +// when trying to remove a directory. +// +// Dynamically-sized objects cannot be marshalled. +type FUSERmDirIn struct { + marshal.StubMarshallable + + // Name is a directory name to be removed. + Name string +} + +// MarshalBytes serializes r.name to the dst buffer. +func (r *FUSERmDirIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) +} + +// SizeBytes is the size of the memory representation of FUSERmDirIn. +func (r *FUSERmDirIn) SizeBytes() int { + return len(r.Name) + 1 +} + +// FUSEDirents is a list of Dirents received from the FUSE daemon server. +// It is used for FUSE_READDIR. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEDirents struct { + marshal.StubMarshallable + + Dirents []*FUSEDirent +} + +// FUSEDirent is a Dirent received from the FUSE daemon server. +// It is used for FUSE_READDIR. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEDirent struct { + marshal.StubMarshallable + + // Meta contains all the static fields of FUSEDirent. + Meta FUSEDirentMeta + + // Name is the filename of the dirent. + Name string +} + +// FUSEDirentMeta contains all the static fields of FUSEDirent. +// It is used for FUSE_READDIR. +// +// +marshal +type FUSEDirentMeta struct { + // Inode of the dirent. + Ino uint64 + + // Offset of the dirent. + Off uint64 + + // NameLen is the length of the dirent name. + NameLen uint32 + + // Type of the dirent. + Type uint32 +} + +// SizeBytes is the size of the memory representation of FUSEDirents. +func (r *FUSEDirents) SizeBytes() int { + var sizeBytes int + for _, dirent := range r.Dirents { + sizeBytes += dirent.SizeBytes() + } + + return sizeBytes +} + +// UnmarshalBytes deserializes FUSEDirents from the src buffer. +func (r *FUSEDirents) UnmarshalBytes(src []byte) { + for { + if len(src) <= (*FUSEDirentMeta)(nil).SizeBytes() { + break + } + + // Its unclear how many dirents there are in src. Each dirent is dynamically + // sized and so we can't make assumptions about how many dirents we can allocate. + if r.Dirents == nil { + r.Dirents = make([]*FUSEDirent, 0) + } + + // We have to allocate a struct for each dirent - there must be a better way + // to do this. Linux allocates 1 page to store all the dirents and then + // simply reads them from the page. + var dirent FUSEDirent + dirent.UnmarshalBytes(src) + r.Dirents = append(r.Dirents, &dirent) + + src = src[dirent.SizeBytes():] + } +} + +// SizeBytes is the size of the memory representation of FUSEDirent. +func (r *FUSEDirent) SizeBytes() int { + dataSize := r.Meta.SizeBytes() + len(r.Name) + + // Each Dirent must be padded such that its size is a multiple + // of FUSE_DIRENT_ALIGN. Similar to the fuse dirent alignment + // in linux/fuse.h. + return (dataSize + (FUSE_DIRENT_ALIGN - 1)) & ^(FUSE_DIRENT_ALIGN - 1) +} + +// UnmarshalBytes deserializes FUSEDirent from the src buffer. +func (r *FUSEDirent) UnmarshalBytes(src []byte) { + r.Meta.UnmarshalBytes(src) + src = src[r.Meta.SizeBytes():] + + if r.Meta.NameLen > FUSE_NAME_MAX { + // The name is too long and therefore invalid. We don't + // need to unmarshal the name since it'll be thrown away. + return + } + + buf := make([]byte, r.Meta.NameLen) + name := primitive.ByteSlice(buf) + name.UnmarshalBytes(src[:r.Meta.NameLen]) + r.Name = string(name) +} + +// FATTR_* consts are the attribute flags defined in include/uapi/linux/fuse.h. +// These should be or-ed together for setattr to know what has been changed. +const ( + FATTR_MODE = (1 << 0) + FATTR_UID = (1 << 1) + FATTR_GID = (1 << 2) + FATTR_SIZE = (1 << 3) + FATTR_ATIME = (1 << 4) + FATTR_MTIME = (1 << 5) + FATTR_FH = (1 << 6) + FATTR_ATIME_NOW = (1 << 7) + FATTR_MTIME_NOW = (1 << 8) + FATTR_LOCKOWNER = (1 << 9) + FATTR_CTIME = (1 << 10) +) + +// FUSESetAttrIn is the request sent by the kernel to the daemon, +// to set the attribute(s) of a file. +// +// +marshal +type FUSESetAttrIn struct { + // Valid indicates which attributes are modified by this request. + Valid uint32 + + _ uint32 + + // Fh is used to identify the file if FATTR_FH is set in Valid. + Fh uint64 + + // Size is the size that the request wants to change to. + Size uint64 + + // LockOwner is the owner of the lock that the request wants to change to. + LockOwner uint64 + + // Atime is the access time that the request wants to change to. + Atime uint64 + + // Mtime is the modification time that the request wants to change to. + Mtime uint64 + + // Ctime is the status change time that the request wants to change to. + Ctime uint64 + + // AtimeNsec is the nano second part of Atime. + AtimeNsec uint32 + + // MtimeNsec is the nano second part of Mtime. + MtimeNsec uint32 + + // CtimeNsec is the nano second part of Ctime. + CtimeNsec uint32 + + // Mode is the file mode that the request wants to change to. + Mode uint32 + + _ uint32 + + // UID is the user ID of the owner that the request wants to change to. + UID uint32 + + // GID is the group ID of the owner that the request wants to change to. + GID uint32 + + _ uint32 +} + +// FUSEUnlinkIn is the request sent by the kernel to the daemon +// when trying to unlink a node. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEUnlinkIn struct { + marshal.StubMarshallable + + // Name of the node to unlink. + Name string +} + +// MarshalBytes serializes r.name to the dst buffer, which should +// have size len(r.Name) + 1 and last byte set to 0. +func (r *FUSEUnlinkIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) +} + +// SizeBytes is the size of the memory representation of FUSEUnlinkIn. +// 1 extra byte for null-terminated Name string. +func (r *FUSEUnlinkIn) SizeBytes() int { + return len(r.Name) + 1 +} diff --git a/pkg/abi/linux/linux_abi_autogen_unsafe.go b/pkg/abi/linux/linux_abi_autogen_unsafe.go index 4bb1c964c..4746c4b2b 100644 --- a/pkg/abi/linux/linux_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_abi_autogen_unsafe.go @@ -16,15 +16,26 @@ import ( // Marshallable types used by this file. var _ marshal.Marshallable = (*ControlMessageCredentials)(nil) var _ marshal.Marshallable = (*FUSEAttr)(nil) +var _ marshal.Marshallable = (*FUSECreateMeta)(nil) +var _ marshal.Marshallable = (*FUSEDirentMeta)(nil) +var _ marshal.Marshallable = (*FUSEEntryOut)(nil) var _ marshal.Marshallable = (*FUSEGetAttrIn)(nil) var _ marshal.Marshallable = (*FUSEGetAttrOut)(nil) var _ marshal.Marshallable = (*FUSEHeaderIn)(nil) var _ marshal.Marshallable = (*FUSEHeaderOut)(nil) var _ marshal.Marshallable = (*FUSEInitIn)(nil) var _ marshal.Marshallable = (*FUSEInitOut)(nil) +var _ marshal.Marshallable = (*FUSEMkdirMeta)(nil) +var _ marshal.Marshallable = (*FUSEMknodMeta)(nil) var _ marshal.Marshallable = (*FUSEOpID)(nil) var _ marshal.Marshallable = (*FUSEOpcode)(nil) +var _ marshal.Marshallable = (*FUSEOpenIn)(nil) +var _ marshal.Marshallable = (*FUSEOpenOut)(nil) +var _ marshal.Marshallable = (*FUSEReadIn)(nil) +var _ marshal.Marshallable = (*FUSEReleaseIn)(nil) +var _ marshal.Marshallable = (*FUSESetAttrIn)(nil) var _ marshal.Marshallable = (*FUSEWriteIn)(nil) +var _ marshal.Marshallable = (*FUSEWriteOut)(nil) var _ marshal.Marshallable = (*IFConf)(nil) var _ marshal.Marshallable = (*IFReq)(nil) var _ marshal.Marshallable = (*IP6TEntry)(nil) @@ -170,7 +181,7 @@ func (s *Statx) MarshalUnsafe(dst []byte) { // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Statx) UnmarshalUnsafe(src []byte) { - if s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { + if s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() && s.Btime.Packed() { safecopy.CopyOut(unsafe.Pointer(s), src) } else { // Type Statx doesn't have a packed layout in memory, fallback to UnmarshalBytes. @@ -211,7 +222,7 @@ func (s *Statx) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { // CopyIn implements marshal.Marshallable.CopyIn. //go:nosplit func (s *Statx) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - if !s.Mtime.Packed() && s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() { + if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := task.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := task.CopyInBytes(addr, buf) // escapes: okay. @@ -656,7 +667,7 @@ func (f *FUSEHeaderIn) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (f *FUSEHeaderIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !f.Unique.Packed() && f.Opcode.Packed() { + if !f.Opcode.Packed() && f.Unique.Packed() { // Type FUSEHeaderIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. @@ -868,119 +879,6 @@ func (f *FUSEHeaderOut) WriteTo(writer io.Writer) (int64, error) { } // SizeBytes implements marshal.Marshallable.SizeBytes. -func (f *FUSEWriteIn) SizeBytes() int { - return 40 -} - -// MarshalBytes implements marshal.Marshallable.MarshalBytes. -func (f *FUSEWriteIn) MarshalBytes(dst []byte) { - usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) - dst = dst[8:] - usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Offset)) - dst = dst[8:] - usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) - dst = dst[4:] - usermem.ByteOrder.PutUint32(dst[:4], uint32(f.WriteFlags)) - dst = dst[4:] - usermem.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) - dst = dst[8:] - usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) - dst = dst[4:] - // Padding: dst[:sizeof(uint32)] ~= uint32(0) - dst = dst[4:] -} - -// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. -func (f *FUSEWriteIn) UnmarshalBytes(src []byte) { - f.Fh = uint64(usermem.ByteOrder.Uint64(src[:8])) - src = src[8:] - f.Offset = uint64(usermem.ByteOrder.Uint64(src[:8])) - src = src[8:] - f.Size = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - f.WriteFlags = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - f.LockOwner = uint64(usermem.ByteOrder.Uint64(src[:8])) - src = src[8:] - f.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) - src = src[4:] - // Padding: var _ uint32 ~= src[:sizeof(uint32)] - src = src[4:] -} - -// Packed implements marshal.Marshallable.Packed. -//go:nosplit -func (f *FUSEWriteIn) Packed() bool { - return true -} - -// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. -func (f *FUSEWriteIn) MarshalUnsafe(dst []byte) { - safecopy.CopyIn(dst, unsafe.Pointer(f)) -} - -// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. -func (f *FUSEWriteIn) UnmarshalUnsafe(src []byte) { - safecopy.CopyOut(unsafe.Pointer(f), src) -} - -// CopyOutN implements marshal.Marshallable.CopyOutN. -//go:nosplit -func (f *FUSEWriteIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - // Construct a slice backed by dst's underlying memory. - var buf []byte - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) - hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) - hdr.Len = f.SizeBytes() - hdr.Cap = f.SizeBytes() - - length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. - // Since we bypassed the compiler's escape analysis, indicate that f - // must live until the use above. - runtime.KeepAlive(f) // escapes: replaced by intrinsic. - return length, err -} - -// CopyOut implements marshal.Marshallable.CopyOut. -//go:nosplit -func (f *FUSEWriteIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { - return f.CopyOutN(task, addr, f.SizeBytes()) -} - -// CopyIn implements marshal.Marshallable.CopyIn. -//go:nosplit -func (f *FUSEWriteIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - // Construct a slice backed by dst's underlying memory. - var buf []byte - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) - hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) - hdr.Len = f.SizeBytes() - hdr.Cap = f.SizeBytes() - - length, err := task.CopyInBytes(addr, buf) // escapes: okay. - // Since we bypassed the compiler's escape analysis, indicate that f - // must live until the use above. - runtime.KeepAlive(f) // escapes: replaced by intrinsic. - return length, err -} - -// WriteTo implements io.WriterTo.WriteTo. -func (f *FUSEWriteIn) WriteTo(writer io.Writer) (int64, error) { - // Construct a slice backed by dst's underlying memory. - var buf []byte - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) - hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) - hdr.Len = f.SizeBytes() - hdr.Cap = f.SizeBytes() - - length, err := writer.Write(buf) - // Since we bypassed the compiler's escape analysis, indicate that f - // must live until the use above. - runtime.KeepAlive(f) // escapes: replaced by intrinsic. - return int64(length), err -} - -// SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEInitIn) SizeBytes() int { return 16 } @@ -1107,7 +1005,7 @@ func (f *FUSEInitOut) MarshalBytes(dst []byte) { dst = dst[4:] usermem.ByteOrder.PutUint16(dst[:2], uint16(f.MaxPages)) dst = dst[2:] - usermem.ByteOrder.PutUint16(dst[:2], uint16(f.MapAlignment)) + // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] // Padding: dst[:sizeof(uint32)*8] ~= [8]uint32{0} dst = dst[4*(8):] @@ -1133,7 +1031,7 @@ func (f *FUSEInitOut) UnmarshalBytes(src []byte) { src = src[4:] f.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2])) src = src[2:] - f.MapAlignment = uint16(usermem.ByteOrder.Uint16(src[:2])) + // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] // Padding: ~ copy([8]uint32(f._), src[:sizeof(uint32)*8]) src = src[4*(8):] @@ -1595,6 +1493,1306 @@ func (f *FUSEGetAttrOut) WriteTo(writer io.Writer) (int64, error) { } // SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEEntryOut) SizeBytes() int { + return 40 + + (*FUSEAttr)(nil).SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEEntryOut) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.NodeID)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Generation)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.EntryValid)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.AttrValid)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.EntryValidNSec)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.AttrValidNSec)) + dst = dst[4:] + f.Attr.MarshalBytes(dst[:f.Attr.SizeBytes()]) + dst = dst[f.Attr.SizeBytes():] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEEntryOut) UnmarshalBytes(src []byte) { + f.NodeID = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Generation = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.EntryValid = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.AttrValid = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.EntryValidNSec = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.AttrValidNSec = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Attr.UnmarshalBytes(src[:f.Attr.SizeBytes()]) + src = src[f.Attr.SizeBytes():] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEEntryOut) Packed() bool { + return f.Attr.Packed() +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEEntryOut) MarshalUnsafe(dst []byte) { + if f.Attr.Packed() { + safecopy.CopyIn(dst, unsafe.Pointer(f)) + } else { + // Type FUSEEntryOut doesn't have a packed layout in memory, fallback to MarshalBytes. + f.MarshalBytes(dst) + } +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEEntryOut) UnmarshalUnsafe(src []byte) { + if f.Attr.Packed() { + safecopy.CopyOut(unsafe.Pointer(f), src) + } else { + // Type FUSEEntryOut doesn't have a packed layout in memory, fallback to UnmarshalBytes. + f.UnmarshalBytes(src) + } +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEEntryOut) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + if !f.Attr.Packed() { + // Type FUSEEntryOut doesn't have a packed layout in memory, fall back to MarshalBytes. + buf := task.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. + f.MarshalBytes(buf) // escapes: fallback. + return task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEEntryOut) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEEntryOut) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + if !f.Attr.Packed() { + // Type FUSEEntryOut doesn't have a packed layout in memory, fall back to UnmarshalBytes. + buf := task.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Unmarshal unconditionally. If we had a short copy-in, this results in a + // partially unmarshalled struct. + f.UnmarshalBytes(buf) // escapes: fallback. + return length, err + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEEntryOut) WriteTo(writer io.Writer) (int64, error) { + if !f.Attr.Packed() { + // Type FUSEEntryOut doesn't have a packed layout in memory, fall back to MarshalBytes. + buf := make([]byte, f.SizeBytes()) + f.MarshalBytes(buf) + length, err := writer.Write(buf) + return int64(length), err + } + + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEOpenIn) SizeBytes() int { + return 8 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEOpenIn) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEOpenIn) UnmarshalBytes(src []byte) { + f.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEOpenIn) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEOpenIn) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEOpenIn) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEOpenIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEOpenIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEOpenIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEOpenIn) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEOpenOut) SizeBytes() int { + return 12 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEOpenOut) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.OpenFlag)) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEOpenOut) UnmarshalBytes(src []byte) { + f.Fh = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.OpenFlag = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEOpenOut) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEOpenOut) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEOpenOut) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEOpenOut) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEOpenOut) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEOpenOut) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEOpenOut) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEReadIn) SizeBytes() int { + return 40 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEReadIn) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Offset)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.ReadFlags)) + dst = dst[4:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEReadIn) UnmarshalBytes(src []byte) { + f.Fh = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Offset = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Size = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.ReadFlags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.LockOwner = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEReadIn) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEReadIn) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEReadIn) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEReadIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEReadIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEReadIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEReadIn) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEWriteIn) SizeBytes() int { + return 40 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEWriteIn) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Offset)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.WriteFlags)) + dst = dst[4:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEWriteIn) UnmarshalBytes(src []byte) { + f.Fh = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Offset = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Size = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.WriteFlags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.LockOwner = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEWriteIn) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEWriteIn) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEWriteIn) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEWriteIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEWriteIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEWriteIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEWriteIn) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEWriteOut) SizeBytes() int { + return 8 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEWriteOut) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEWriteOut) UnmarshalBytes(src []byte) { + f.Size = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEWriteOut) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEWriteOut) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEWriteOut) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEWriteOut) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEWriteOut) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEWriteOut) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEWriteOut) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEReleaseIn) SizeBytes() int { + return 24 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEReleaseIn) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.ReleaseFlags)) + dst = dst[4:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) + dst = dst[8:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEReleaseIn) UnmarshalBytes(src []byte) { + f.Fh = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.ReleaseFlags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.LockOwner = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEReleaseIn) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEReleaseIn) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEReleaseIn) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEReleaseIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEReleaseIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEReleaseIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEReleaseIn) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSECreateMeta) SizeBytes() int { + return 16 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSECreateMeta) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Umask)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSECreateMeta) UnmarshalBytes(src []byte) { + f.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Mode = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Umask = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSECreateMeta) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSECreateMeta) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSECreateMeta) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSECreateMeta) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSECreateMeta) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSECreateMeta) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSECreateMeta) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEMknodMeta) SizeBytes() int { + return 16 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEMknodMeta) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Rdev)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Umask)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEMknodMeta) UnmarshalBytes(src []byte) { + f.Mode = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Rdev = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Umask = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEMknodMeta) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEMknodMeta) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEMknodMeta) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEMknodMeta) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEMknodMeta) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEMknodMeta) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEMknodMeta) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEMkdirMeta) SizeBytes() int { + return 8 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEMkdirMeta) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Umask)) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEMkdirMeta) UnmarshalBytes(src []byte) { + f.Mode = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Umask = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEMkdirMeta) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEMkdirMeta) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEMkdirMeta) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEMkdirMeta) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEMkdirMeta) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEMkdirMeta) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEMkdirMeta) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSEDirentMeta) SizeBytes() int { + return 24 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSEDirentMeta) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Ino)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Off)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.NameLen)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Type)) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSEDirentMeta) UnmarshalBytes(src []byte) { + f.Ino = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Off = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.NameLen = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Type = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSEDirentMeta) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSEDirentMeta) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSEDirentMeta) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSEDirentMeta) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSEDirentMeta) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSEDirentMeta) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSEDirentMeta) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (f *FUSESetAttrIn) SizeBytes() int { + return 88 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (f *FUSESetAttrIn) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Valid)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Size)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Atime)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Mtime)) + dst = dst[8:] + usermem.ByteOrder.PutUint64(dst[:8], uint64(f.Ctime)) + dst = dst[8:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.AtimeNsec)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.MtimeNsec)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.CtimeNsec)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.UID)) + dst = dst[4:] + usermem.ByteOrder.PutUint32(dst[:4], uint32(f.GID)) + dst = dst[4:] + // Padding: dst[:sizeof(uint32)] ~= uint32(0) + dst = dst[4:] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (f *FUSESetAttrIn) UnmarshalBytes(src []byte) { + f.Valid = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] + f.Fh = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Size = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.LockOwner = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Atime = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Mtime = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.Ctime = uint64(usermem.ByteOrder.Uint64(src[:8])) + src = src[8:] + f.AtimeNsec = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.MtimeNsec = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.CtimeNsec = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.Mode = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] + f.UID = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + f.GID = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + // Padding: var _ uint32 ~= src[:sizeof(uint32)] + src = src[4:] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (f *FUSESetAttrIn) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (f *FUSESetAttrIn) MarshalUnsafe(dst []byte) { + safecopy.CopyIn(dst, unsafe.Pointer(f)) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (f *FUSESetAttrIn) UnmarshalUnsafe(src []byte) { + safecopy.CopyOut(unsafe.Pointer(f), src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (f *FUSESetAttrIn) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (f *FUSESetAttrIn) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + return f.CopyOutN(task, addr, f.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +//go:nosplit +func (f *FUSESetAttrIn) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := task.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (f *FUSESetAttrIn) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) + hdr.Len = f.SizeBytes() + hdr.Cap = f.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that f + // must live until the use above. + runtime.KeepAlive(f) // escapes: replaced by intrinsic. + return int64(length), err +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RobustListHead) SizeBytes() int { return 24 } @@ -2038,7 +3236,7 @@ func (i *IPTEntry) MarshalUnsafe(dst []byte) { // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTEntry) UnmarshalUnsafe(src []byte) { - if i.IP.Packed() && i.Counters.Packed() { + if i.Counters.Packed() && i.IP.Packed() { safecopy.CopyOut(unsafe.Pointer(i), src) } else { // Type IPTEntry doesn't have a packed layout in memory, fallback to UnmarshalBytes. @@ -2226,7 +3424,7 @@ func (i *IPTIP) MarshalUnsafe(dst []byte) { // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTIP) UnmarshalUnsafe(src []byte) { - if i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() { + if i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { safecopy.CopyOut(unsafe.Pointer(i), src) } else { // Type IPTIP doesn't have a packed layout in memory, fallback to UnmarshalBytes. @@ -2237,7 +3435,7 @@ func (i *IPTIP) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (i *IPTIP) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { + if !i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. @@ -2267,7 +3465,7 @@ func (i *IPTIP) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { // CopyIn implements marshal.Marshallable.CopyIn. //go:nosplit func (i *IPTIP) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - if !i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() { + if !i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := task.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := task.CopyInBytes(addr, buf) // escapes: okay. @@ -2293,7 +3491,7 @@ func (i *IPTIP) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (i *IPTIP) WriteTo(writer io.Writer) (int64, error) { - if !i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() { + if !i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) @@ -3002,7 +4200,7 @@ func (i *IP6TEntry) UnmarshalBytes(src []byte) { // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IP6TEntry) Packed() bool { - return i.IPv6.Packed() && i.Counters.Packed() + return i.Counters.Packed() && i.IPv6.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. @@ -3017,7 +4215,7 @@ func (i *IP6TEntry) MarshalUnsafe(dst []byte) { // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IP6TEntry) UnmarshalUnsafe(src []byte) { - if i.IPv6.Packed() && i.Counters.Packed() { + if i.Counters.Packed() && i.IPv6.Packed() { safecopy.CopyOut(unsafe.Pointer(i), src) } else { // Type IP6TEntry doesn't have a packed layout in memory, fallback to UnmarshalBytes. @@ -3199,12 +4397,12 @@ func (i *IP6TIP) UnmarshalBytes(src []byte) { // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IP6TIP) Packed() bool { - return i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() + return i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IP6TIP) MarshalUnsafe(dst []byte) { - if i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { + if i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(i)) } else { // Type IP6TIP doesn't have a packed layout in memory, fallback to MarshalBytes. @@ -3225,7 +4423,7 @@ func (i *IP6TIP) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (i *IP6TIP) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { + if !i.SrcMask.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() { // Type IP6TIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. @@ -3281,7 +4479,7 @@ func (i *IP6TIP) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (i *IP6TIP) WriteTo(writer io.Writer) (int64, error) { - if !i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() && i.DstMask.Packed() { + if !i.DstMask.Packed() && i.Src.Packed() && i.Dst.Packed() && i.SrcMask.Packed() { // Type IP6TIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) diff --git a/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go b/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go index 11c93ba26..224d6c6a7 100644 --- a/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go @@ -303,7 +303,7 @@ func (s *Stat) MarshalUnsafe(dst []byte) { // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Stat) UnmarshalUnsafe(src []byte) { - if s.CTime.Packed() && s.ATime.Packed() && s.MTime.Packed() { + if s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() { safecopy.CopyOut(unsafe.Pointer(s), src) } else { // Type Stat doesn't have a packed layout in memory, fallback to UnmarshalBytes. @@ -370,7 +370,7 @@ func (s *Stat) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (s *Stat) WriteTo(writer io.Writer) (int64, error) { - if !s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() { + if !s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) diff --git a/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go b/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go index 3f48c6279..2aa8821a3 100644 --- a/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go @@ -295,7 +295,7 @@ func (s *Stat) UnmarshalBytes(src []byte) { // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Stat) Packed() bool { - return s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() + return s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. @@ -321,7 +321,7 @@ func (s *Stat) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (s *Stat) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !s.MTime.Packed() && s.CTime.Packed() && s.ATime.Packed() { + if !s.ATime.Packed() && s.MTime.Packed() && s.CTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index c8231e0aa..5d4f312cf 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, nil + return length, n, msg.Controllen, controlTrunc, err } - return n, n, msg.Controllen, controlTrunc, nil + return n, n, msg.Controllen, controlTrunc, err } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/fsimpl/devpts/root_inode_refs.go b/pkg/sentry/fsimpl/devpts/root_inode_refs.go index b860819f0..051801202 100644 --- a/pkg/sentry/fsimpl/devpts/root_inode_refs.go +++ b/pkg/sentry/fsimpl/devpts/root_inode_refs.go @@ -1,12 +1,12 @@ package devpts import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index 6df2728ab..a7402c149 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -15,31 +15,17 @@ package fuse import ( - "errors" - "fmt" "sync" - "sync/atomic" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) -// maxActiveRequestsDefault is the default setting controlling the upper bound -// on the number of active requests at any given time. -const maxActiveRequestsDefault = 10000 - -// Ordinary requests have even IDs, while interrupts IDs are odd. -// Used to increment the unique ID for each FUSE request. -var reqIDStep uint64 = 2 - const ( // fuseDefaultMaxBackground is the default value for MaxBackground. fuseDefaultMaxBackground = 12 @@ -52,43 +38,36 @@ const ( fuseDefaultMaxPagesPerReq = 32 ) -// Request represents a FUSE operation request that hasn't been sent to the -// server yet. -// -// +stateify savable -type Request struct { - requestEntry - - id linux.FUSEOpID - hdr *linux.FUSEHeaderIn - data []byte -} - -// Response represents an actual response from the server, including the -// response payload. -// -// +stateify savable -type Response struct { - opcode linux.FUSEOpcode - hdr linux.FUSEHeaderOut - data []byte -} - // connection is the struct by which the sentry communicates with the FUSE server daemon. +// Lock order: +// - conn.fd.mu +// - conn.mu +// - conn.asyncMu type connection struct { fd *DeviceFD + // mu protects access to struct memebers. + mu sync.Mutex + + // attributeVersion is the version of connection's attributes. + attributeVersion uint64 + + // We target FUSE 7.23. // The following FUSE_INIT flags are currently unsupported by this implementation: - // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC) // - FUSE_EXPORT_SUPPORT - // - FUSE_HANDLE_KILLPRIV // - FUSE_POSIX_LOCKS: requires POSIX locks // - FUSE_FLOCK_LOCKS: requires POSIX locks // - FUSE_AUTO_INVAL_DATA: requires page caching eviction - // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation // - FUSE_ASYNC_DIO - // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler + // - FUSE_PARALLEL_DIROPS (7.25) + // - FUSE_HANDLE_KILLPRIV (7.26) + // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26) + // - FUSE_ABORT_ERROR (7.27) + // - FUSE_CACHE_SYMLINKS (7.28) + // - FUSE_NO_OPENDIR_SUPPORT (7.29) + // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30) + // - FUSE_MAP_ALIGNMENT (7.31) // initialized after receiving FUSE_INIT reply. // Until it's set, suspend sending FUSE requests. @@ -98,10 +77,6 @@ type connection struct { // initializedChan is used to block requests before initialization. initializedChan chan struct{} - // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground). - // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block. - blocked bool - // connected (connection established) when a new FUSE file system is created. // Set to false when: // umount, @@ -109,48 +84,55 @@ type connection struct { // device release. connected bool - // aborted via sysfs. - // TODO(gvisor.dev/issue/3185): abort all queued requests. - aborted bool - // connInitError if FUSE_INIT encountered error (major version mismatch). // Only set in INIT. connInitError bool // connInitSuccess if FUSE_INIT is successful. // Only set in INIT. - // Used for destory. + // Used for destory (not yet implemented). connInitSuccess bool - // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress. - - // NumberBackground is the number of requests in the background. - numBackground uint16 + // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV). + // Set only if abortErr is true and via fuse control fs (not yet implemented). + // TODO(gvisor.dev/issue/3525): set this to true when user aborts. + aborted bool - // congestionThreshold for NumBackground. - // Negotiated in FUSE_INIT. - congestionThreshold uint16 + // numWating is the number of requests waiting to be + // sent to FUSE device or being processed by FUSE daemon. + numWaiting uint32 - // maxBackground is the maximum number of NumBackground. - // Block connection when it is reached. - // Negotiated in FUSE_INIT. - maxBackground uint16 + // Terminology note: + // + // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct. + // + // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct. + // + // We call the "background" requests in unix term as async requests. + // The "async requests" in unix term is our async requests that expect a reply, + // i.e. `!requestOptions.noReply` - // numActiveBackground is the number of requests in background and has being marked as active. - numActiveBackground uint16 + // asyncMu protects the async request fields. + asyncMu sync.Mutex - // numWating is the number of requests waiting for completion. - numWaiting uint32 + // asyncNum is the number of async requests. + // Protected by asyncMu. + asyncNum uint16 - // TODO(gvisor.dev/issue/3185): BgQueue - // some queue for background queued requests. + // asyncCongestionThreshold the number of async requests. + // Negotiated in FUSE_INIT as "CongestionThreshold". + // TODO(gvisor.dev/issue/3529): add congestion control. + // Protected by asyncMu. + asyncCongestionThreshold uint16 - // bgLock protects: - // MaxBackground, CongestionThreshold, NumBackground, - // NumActiveBackground, BgQueue, Blocked. - bgLock sync.Mutex + // asyncNumMax is the maximum number of asyncNum. + // Connection blocks the async requests when it is reached. + // Negotiated in FUSE_INIT as "MaxBackground". + // Protected by asyncMu. + asyncNumMax uint16 // maxRead is the maximum size of a read buffer in in bytes. + // Initialized from a fuse fs parameter. maxRead uint32 // maxWrite is the maximum size of a write buffer in bytes. @@ -165,23 +147,20 @@ type connection struct { // Negotiated and only set in INIT. minor uint32 - // asyncRead if read pages asynchronously. + // atomicOTrunc is true when FUSE does not send a separate SETATTR request + // before open with O_TRUNC flag. // Negotiated and only set in INIT. - asyncRead bool + atomicOTrunc bool - // abortErr is true if kernel need to return an unique read error after abort. + // asyncRead if read pages asynchronously. // Negotiated and only set in INIT. - abortErr bool + asyncRead bool // writebackCache is true for write-back cache policy, // false for write-through policy. // Negotiated and only set in INIT. writebackCache bool - // cacheSymlinks if filesystem needs to cache READLINK responses in page cache. - // Negotiated and only set in INIT. - cacheSymlinks bool - // bigWrites if doing multi-page cached writes. // Negotiated and only set in INIT. bigWrites bool @@ -189,116 +168,70 @@ type connection struct { // dontMask if filestestem does not apply umask to creation modes. // Negotiated in INIT. dontMask bool + + // noOpen if FUSE server doesn't support open operation. + // This flag only influence performance, not correctness of the program. + noOpen bool } // newFUSEConnection creates a FUSE connection to fd. -func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) { +func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) { // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to // mount a FUSE filesystem. fuseFD := fd.Impl().(*DeviceFD) - fuseFD.mounted = true // Create the writeBuf for the header to be stored in. hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) fuseFD.writeBuf = make([]byte, hdrLen) fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) - fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) + fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests) fuseFD.writeCursor = 0 return &connection{ - fd: fuseFD, - maxBackground: fuseDefaultMaxBackground, - congestionThreshold: fuseDefaultCongestionThreshold, - maxPages: fuseDefaultMaxPagesPerReq, - initializedChan: make(chan struct{}), - connected: true, - }, nil -} - -// SetInitialized atomically sets the connection as initialized. -func (conn *connection) SetInitialized() { - // Unblock the requests sent before INIT. - close(conn.initializedChan) - - // Close the channel first to avoid the non-atomic situation - // where conn.initialized is true but there are - // tasks being blocked on the channel. - // And it prevents the newer tasks from gaining - // unnecessary higher chance to be issued before the blocked one. - - atomic.StoreInt32(&(conn.initialized), int32(1)) -} - -// IsInitialized atomically check if the connection is initialized. -// pairs with SetInitialized(). -func (conn *connection) Initialized() bool { - return atomic.LoadInt32(&(conn.initialized)) != 0 -} - -// NewRequest creates a new request that can be sent to the FUSE server. -func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { - conn.fd.mu.Lock() - defer conn.fd.mu.Unlock() - conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) - - hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() - hdr := linux.FUSEHeaderIn{ - Len: uint32(hdrLen + payload.SizeBytes()), - Opcode: opcode, - Unique: conn.fd.nextOpID, - NodeID: ino, - UID: uint32(creds.EffectiveKUID), - GID: uint32(creds.EffectiveKGID), - PID: pid, - } - - buf := make([]byte, hdr.Len) - hdr.MarshalUnsafe(buf[:hdrLen]) - payload.MarshalUnsafe(buf[hdrLen:]) - - return &Request{ - id: hdr.Unique, - hdr: &hdr, - data: buf, + fd: fuseFD, + asyncNumMax: fuseDefaultMaxBackground, + asyncCongestionThreshold: fuseDefaultCongestionThreshold, + maxRead: opts.maxRead, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, }, nil } -// Call makes a request to the server and blocks the invoking task until a -// server responds with a response. Task should never be nil. -// Requests will not be sent before the connection is initialized. -// For async tasks, use CallAsync(). -func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { - // Block requests sent before connection is initalized. - if !conn.Initialized() { - if err := t.Block(conn.initializedChan); err != nil { - return nil, err - } - } - - return conn.call(t, r) +// CallAsync makes an async (aka background) request. +// It's a simple wrapper around Call(). +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + r.async = true + _, err := conn.Call(t, r) + return err } -// CallAsync makes an async (aka background) request. -// Those requests either do not expect a response (e.g. release) or -// the response should be handled by others (e.g. init). -// Return immediately unless the connection is blocked (before initialization). -// Async call example: init, release, forget, aio, interrupt. +// Call makes a request to the server. +// Block before the connection is initialized. // When the Request is FUSE_INIT, it will not be blocked before initialization. -func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { +// Task should never be nil. +// +// For a sync request, it blocks the invoking task until +// a server responds with a response. +// +// For an async request (that do not expect a response immediately), +// it returns directly unless being blocked either before initialization +// or when there are too many async requests ongoing. +// +// Example for async request: +// init, readahead, write, async read/write, fuse_notify_reply, +// non-sync release, interrupt, forget. +// +// The forget request does not have a reply, +// as documented in include/uapi/linux/fuse.h:FUSE_FORGET. +func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { // Block requests sent before connection is initalized. if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { if err := t.Block(conn.initializedChan); err != nil { - return err + return nil, err } } - // This should be the only place that invokes call() with a nil task. - _, err := conn.call(nil, r) - return err -} - -// call makes a call without blocking checks. -func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { if !conn.connected { return nil, syserror.ENOTCONN } @@ -315,31 +248,6 @@ func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { return fut.resolve(t) } -// Error returns the error of the FUSE call. -func (r *Response) Error() error { - errno := r.hdr.Error - if errno >= 0 { - return nil - } - - sysErrNo := syscall.Errno(-errno) - return error(sysErrNo) -} - -// UnmarshalPayload unmarshals the response data into m. -func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { - hdrLen := r.hdr.SizeBytes() - haveDataLen := r.hdr.Len - uint32(hdrLen) - wantDataLen := uint32(m.SizeBytes()) - - if haveDataLen < wantDataLen { - return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) - } - - m.UnmarshalUnsafe(r.data[hdrLen:]) - return nil -} - // callFuture makes a request to the server and returns a future response. // Call resolve() when the response needs to be fulfilled. func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { @@ -358,11 +266,6 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, // if there are always too many ongoing requests all the time. The // supported maxActiveRequests setting should be really high to avoid this. for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { - if t == nil { - // Since there is no task that is waiting. We must error out. - return nil, errors.New("FUSE request queue full") - } - log.Infof("Blocking request %v from being queued. Too many active requests: %v", r.id, conn.fd.numActiveRequests) conn.fd.mu.Unlock() @@ -378,9 +281,19 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, // callFutureLocked makes a request to the server and returns a future response. func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { + // Check connected again holding conn.mu. + conn.mu.Lock() + if !conn.connected { + conn.mu.Unlock() + // we checked connected before, + // this must be due to aborted connection. + return nil, syserror.ECONNABORTED + } + conn.mu.Unlock() + conn.fd.queue.PushBack(r) - conn.fd.numActiveRequests += 1 - fut := newFutureResponse(r.hdr.Opcode) + conn.fd.numActiveRequests++ + fut := newFutureResponse(r) conn.fd.completions[r.id] = fut // Signal the readers that there is something to read. @@ -388,50 +301,3 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes return fut, nil } - -// futureResponse represents an in-flight request, that may or may not have -// completed yet. Convert it to a resolved Response by calling Resolve, but note -// that this may block. -// -// +stateify savable -type futureResponse struct { - opcode linux.FUSEOpcode - ch chan struct{} - hdr *linux.FUSEHeaderOut - data []byte -} - -// newFutureResponse creates a future response to a FUSE request. -func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse { - return &futureResponse{ - opcode: opcode, - ch: make(chan struct{}), - } -} - -// resolve blocks the task until the server responds to its corresponding request, -// then returns a resolved response. -func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { - // If there is no Task associated with this request - then we don't try to resolve - // the response. Instead, the task writing the response (proxy to the server) will - // process the response on our behalf. - if t == nil { - log.Infof("fuse.Response.resolve: Not waiting on a response from server.") - return nil, nil - } - - if err := t.Block(f.ch); err != nil { - return nil, err - } - - return f.getResponse(), nil -} - -// getResponse creates a Response from the data the futureResponse has. -func (f *futureResponse) getResponse() *Response { - return &Response{ - opcode: f.opcode, - hdr: *f.hdr, - data: f.data, - } -} diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/connection_control.go index 779c2bd3f..a63c66e7c 100644 --- a/pkg/sentry/fsimpl/fuse/init.go +++ b/pkg/sentry/fsimpl/fuse/connection_control.go @@ -15,7 +15,11 @@ package fuse import ( + "sync/atomic" + "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) @@ -29,9 +33,10 @@ const ( // Follow the same behavior as unix fuse implementation. fuseMaxTimeGranNs = 1000000000 - // Minimum value for MaxWrite. + // Minimum value for MaxWrite and MaxRead. // Follow the same behavior as unix fuse implementation. fuseMinMaxWrite = 4096 + fuseMinMaxRead = 4096 // Temporary default value for max readahead, 128kb. fuseDefaultMaxReadahead = 131072 @@ -49,6 +54,26 @@ var ( MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold ) +// SetInitialized atomically sets the connection as initialized. +func (conn *connection) SetInitialized() { + // Unblock the requests sent before INIT. + close(conn.initializedChan) + + // Close the channel first to avoid the non-atomic situation + // where conn.initialized is true but there are + // tasks being blocked on the channel. + // And it prevents the newer tasks from gaining + // unnecessary higher chance to be issued before the blocked one. + + atomic.StoreInt32(&(conn.initialized), int32(1)) +} + +// IsInitialized atomically check if the connection is initialized. +// pairs with SetInitialized(). +func (conn *connection) Initialized() bool { + return atomic.LoadInt32(&(conn.initialized)) != 0 +} + // InitSend sends a FUSE_INIT request. func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { in := linux.FUSEInitIn{ @@ -75,24 +100,24 @@ func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { return err } - var out linux.FUSEInitOut - if err := res.UnmarshalPayload(&out); err != nil { + initRes := fuseInitRes{initLen: res.DataLen()} + if err := res.UnmarshalPayload(&initRes); err != nil { return err } - return conn.initProcessReply(&out, hasSysAdminCap) + return conn.initProcessReply(&initRes.initOut, hasSysAdminCap) } // Process the FUSE_INIT reply from the FUSE server. +// It tries to acquire the conn.asyncMu lock if minor version is newer than 13. func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { + // No matter error or not, always set initialzied. + // to unblock the blocked requests. + defer conn.SetInitialized() + // No support for old major fuse versions. if out.Major != linux.FUSE_KERNEL_VERSION { conn.connInitError = true - - // Set the connection as initialized and unblock the blocked requests - // (i.e. return error for them). - conn.SetInitialized() - return nil } @@ -100,29 +125,14 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap conn.connInitSuccess = true conn.minor = out.Minor - // No support for limits before minor version 13. - if out.Minor >= 13 { - conn.bgLock.Lock() - - if out.MaxBackground > 0 { - conn.maxBackground = out.MaxBackground - - if !hasSysAdminCap && - conn.maxBackground > MaxUserBackgroundRequest { - conn.maxBackground = MaxUserBackgroundRequest - } - } - - if out.CongestionThreshold > 0 { - conn.congestionThreshold = out.CongestionThreshold - - if !hasSysAdminCap && - conn.congestionThreshold > MaxUserCongestionThreshold { - conn.congestionThreshold = MaxUserCongestionThreshold - } - } - - conn.bgLock.Unlock() + // No support for negotiating MaxWrite before minor version 5. + if out.Minor >= 5 { + conn.maxWrite = out.MaxWrite + } else { + conn.maxWrite = fuseMinMaxWrite + } + if conn.maxWrite < fuseMinMaxWrite { + conn.maxWrite = fuseMinMaxWrite } // No support for the following flags before minor version 6. @@ -131,8 +141,6 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 - conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0 - conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0 // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). @@ -148,19 +156,90 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap } } - // No support for negotiating MaxWrite before minor version 5. - if out.Minor >= 5 { - conn.maxWrite = out.MaxWrite - } else { - conn.maxWrite = fuseMinMaxWrite + // No support for limits before minor version 13. + if out.Minor >= 13 { + conn.asyncMu.Lock() + + if out.MaxBackground > 0 { + conn.asyncNumMax = out.MaxBackground + + if !hasSysAdminCap && + conn.asyncNumMax > MaxUserBackgroundRequest { + conn.asyncNumMax = MaxUserBackgroundRequest + } + } + + if out.CongestionThreshold > 0 { + conn.asyncCongestionThreshold = out.CongestionThreshold + + if !hasSysAdminCap && + conn.asyncCongestionThreshold > MaxUserCongestionThreshold { + conn.asyncCongestionThreshold = MaxUserCongestionThreshold + } + } + + conn.asyncMu.Unlock() } - if conn.maxWrite < fuseMinMaxWrite { - conn.maxWrite = fuseMinMaxWrite + + return nil +} + +// Abort this FUSE connection. +// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order. +// All possible requests waiting or blocking will be aborted. +func (conn *connection) Abort(ctx context.Context) { + conn.fd.mu.Lock() + conn.mu.Lock() + conn.asyncMu.Lock() + + if !conn.connected { + conn.asyncMu.Unlock() + conn.mu.Unlock() + conn.fd.mu.Unlock() + return } - // Set connection as initialized and unblock the requests - // issued before init. - conn.SetInitialized() + conn.connected = false - return nil + // Empty the `fd.queue` that holds the requests + // not yet read by the FUSE daemon yet. + // These are a subset of the requests in `fuse.completion` map. + for !conn.fd.queue.Empty() { + req := conn.fd.queue.Front() + conn.fd.queue.Remove(req) + } + + var terminate []linux.FUSEOpID + + // 2. Collect the requests have not been sent to FUSE daemon, + // or have not received a reply. + for unique := range conn.fd.completions { + terminate = append(terminate, unique) + } + + // Release all locks to avoid deadlock. + conn.asyncMu.Unlock() + conn.mu.Unlock() + conn.fd.mu.Unlock() + + // 1. The requets blocked before initialization. + // Will reach call() `connected` check and return. + if !conn.Initialized() { + conn.SetInitialized() + } + + // 2. Terminate the requests collected above. + // Set ECONNABORTED error. + // sendError() will remove them from `fd.completion` map. + // Will enter the path of a normally received error. + for _, toTerminate := range terminate { + conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate) + } + + // 3. The requests not yet written to FUSE device. + // Early terminate. + // Will reach callFutureLocked() `connected` check and return. + close(conn.fd.fullQueueCh) + + // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs. } diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index e522ff9a0..64c3e32e2 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -56,9 +55,6 @@ type DeviceFD struct { vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD - // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. - mounted bool - // nextOpID is used to create new requests. nextOpID linux.FUSEOpID @@ -100,13 +96,15 @@ type DeviceFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *DeviceFD) Release(context.Context) { - fd.fs.conn.connected = false + if fd.fs != nil { + fd.fs.conn.connected = false + } } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -116,10 +114,16 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } + // Return ENODEV if the filesystem is umounted. + if fd.fs.umounted { + // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs. + return 0, syserror.ENODEV + } + // We require that any Read done on this filesystem have a sane minimum // read buffer. It must have the capacity for the fixed parts of any request // header (Linux uses the request header and the FUSEWriteIn header for this @@ -143,58 +147,82 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R } // readLocked implements the reading of the fuse device while locked with DeviceFD.mu. +// +// Preconditions: dst is large enough for any reasonable request. func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - if fd.queue.Empty() { - return 0, syserror.ErrWouldBlock - } + var req *Request - var readCursor uint32 - var bytesRead int64 - for { - req := fd.queue.Front() - if dst.NumBytes() < int64(req.hdr.Len) { - // The request is too large. Cannot process it. All requests must be smaller than the - // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT - // handshake. - errno := -int32(syscall.EIO) - if req.hdr.Opcode == linux.FUSE_SETXATTR { - errno = -int32(syscall.E2BIG) - } + // Find the first valid request. + // For the normal case this loop only execute once. + for !fd.queue.Empty() { + req = fd.queue.Front() - // Return the error to the calling task. - if err := fd.sendError(ctx, errno, req); err != nil { - return 0, err - } + if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() { + break + } - // We're done with this request. - fd.queue.Remove(req) + // The request is too large. Cannot process it. All requests must be smaller than the + // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT + // handshake. + errno := -int32(syscall.EIO) + if req.hdr.Opcode == linux.FUSE_SETXATTR { + errno = -int32(syscall.E2BIG) + } - // Restart the read as this request was invalid. - log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.") - return fd.readLocked(ctx, dst, opts) + // Return the error to the calling task. + if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil { + return 0, err } - n, err := dst.CopyOut(ctx, req.data[readCursor:]) + // We're done with this request. + fd.queue.Remove(req) + req = nil + } + + if req == nil { + return 0, syserror.ErrWouldBlock + } + + // We already checked the size: dst must be able to fit the whole request. + // Now we write the marshalled header, the payload, + // and the potential additional payload + // to the user memory IOSequence. + + n, err := dst.CopyOut(ctx, req.data) + if err != nil { + return 0, err + } + if n != len(req.data) { + return 0, syserror.EIO + } + + if req.hdr.Opcode == linux.FUSE_WRITE { + written, err := dst.DropFirst(n).CopyOut(ctx, req.payload) if err != nil { return 0, err } - readCursor += uint32(n) - bytesRead += int64(n) - - if readCursor >= req.hdr.Len { - // Fully done with this req, remove it from the queue. - fd.queue.Remove(req) - break + if written != len(req.payload) { + return 0, syserror.EIO } + n += int(written) + } + + // Fully done with this req, remove it from the queue. + fd.queue.Remove(req) + + // Remove noReply ones from map of requests expecting a reply. + if req.noReply { + fd.numActiveRequests -= 1 + delete(fd.completions, req.hdr.Unique) } - return bytesRead, nil + return int64(n), nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -211,10 +239,15 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs. // writeLocked implements writing to the fuse device while locked with DeviceFD.mu. func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } + // Return ENODEV if the filesystem is umounted. + if fd.fs.umounted { + return 0, syserror.ENODEV + } + var cn, n int64 hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) @@ -276,7 +309,12 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt fut, ok := fd.completions[hdr.Unique] if !ok { - // Server sent us a response for a request we never sent? + if fut.hdr.Unique == linux.FUSE_RELEASE { + // Currently we simply discard the reply for FUSE_RELEASE. + return n + src.NumBytes(), nil + } + // Server sent us a response for a request we never sent, + // or for which we already received a reply (e.g. aborted), an unlikely event. return 0, syserror.EINVAL } @@ -307,8 +345,23 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt // Readiness implements vfs.FileDescriptionImpl.Readiness. func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readinessLocked(mask) +} + +// readinessLocked implements checking the readiness of the fuse device while +// locked with DeviceFD.mu. +func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask - ready |= waiter.EventOut // FD is always writable + + if fd.fs.umounted { + ready |= waiter.EventErr + return ready & mask + } + + // FD is always writable. + ready |= waiter.EventOut if !fd.queue.Empty() { // Have reqs available, FD is readable. ready |= waiter.EventIn @@ -330,7 +383,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -339,58 +392,54 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 // sendResponse sends a response to the waiting task (if any). func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { - // See if the running task need to perform some action before returning. - // Since we just finished writing the future, we can be sure that - // getResponse generates a populated response. - if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil { - return err - } + // Signal the task waiting on a response if any. + defer close(fut.ch) // Signal that the queue is no longer full. select { case fd.fullQueueCh <- struct{}{}: default: } - fd.numActiveRequests -= 1 + fd.numActiveRequests-- + + if fut.async { + return fd.asyncCallBack(ctx, fut.getResponse()) + } - // Signal the task waiting on a response. - close(fut.ch) return nil } -// sendError sends an error response to the waiting task (if any). -func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error { +// sendError sends an error response to the waiting task (if any) by calling sendResponse(). +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { // Return the error to the calling task. outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) respHdr := linux.FUSEHeaderOut{ Len: outHdrLen, Error: errno, - Unique: req.hdr.Unique, + Unique: unique, } fut, ok := fd.completions[respHdr.Unique] if !ok { - // Server sent us a response for a request we never sent? + // A response for a request we never sent, + // or for which we already received a reply (e.g. aborted). return syserror.EINVAL } delete(fd.completions, respHdr.Unique) fut.hdr = &respHdr - if err := fd.sendResponse(ctx, fut); err != nil { - return err - } - - return nil + return fd.sendResponse(ctx, fut) } -// noReceiverAction has the calling kernel.Task do some action if its known that no -// receiver is going to be waiting on the future channel. This is to be used by: -// FUSE_INIT. -func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { - if r.opcode == linux.FUSE_INIT { +// asyncCallBack executes pre-defined callback function for async requests. +// Currently used by: FUSE_INIT. +func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { + switch r.opcode { + case linux.FUSE_INIT: creds := auth.CredentialsFromContext(ctx) rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) + // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. } return nil diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go new file mode 100644 index 000000000..798c4a6f3 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -0,0 +1,105 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type directoryFD struct { + fileDescription +} + +// Allocate implements directoryFD.Allocate. +func (directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + +// PRead implements FileDescriptionImpl.PRead. +func (directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements FileDescriptionImpl.Read. +func (directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileDescriptionImpl.Write. +func (directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// IterDirents implements FileDescriptionImpl.IterDirents. +func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error { + fusefs := dir.inode().fs + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) + + in := linux.FUSEReadIn{ + Fh: dir.Fh, + Offset: uint64(atomic.LoadInt64(&dir.off)), + Size: linux.FUSE_PAGE_SIZE, + Flags: dir.statusFlags(), + } + + // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS. + req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in) + if err != nil { + return err + } + + res, err := fusefs.conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + + var out linux.FUSEDirents + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + for _, fuseDirent := range out.Dirents { + nextOff := int64(fuseDirent.Meta.Off) + dirent := vfs.Dirent{ + Name: fuseDirent.Name, + Type: uint8(fuseDirent.Meta.Type), + Ino: fuseDirent.Meta.Ino, + NextOff: nextOff, + } + + if err := callback.Handle(dirent); err != nil { + return err + } + atomic.StoreInt64(&dir.off, nextOff) + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go new file mode 100644 index 000000000..991efcda4 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/file.go @@ -0,0 +1,133 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fileDescription implements vfs.FileDescriptionImpl for fuse. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // the file handle used in userspace. + Fh uint64 + + // Nonseekable is indicate cannot perform seek on a file. + Nonseekable bool + + // DirectIO suggest fuse to use direct io operation. + DirectIO bool + + // OpenFlag is the flag returned by open. + OpenFlag uint32 + + // off is the file offset. + off int64 +} + +func (fd *fileDescription) dentry() *kernfs.Dentry { + return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry) +} + +func (fd *fileDescription) inode() *inode { + return fd.dentry().Inode().(*inode) +} + +func (fd *fileDescription) filesystem() *vfs.Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem() +} + +func (fd *fileDescription) statusFlags() uint32 { + return fd.vfsfd.StatusFlags() +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fileDescription) Release(ctx context.Context) { + // no need to release if FUSE server doesn't implement Open. + conn := fd.inode().fs.conn + if conn.noOpen { + return + } + + in := linux.FUSEReleaseIn{ + Fh: fd.Fh, + Flags: fd.statusFlags(), + } + // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner. + var opcode linux.FUSEOpcode + if fd.inode().Mode().IsDir() { + opcode = linux.FUSE_RELEASEDIR + } else { + opcode = linux.FUSE_RELEASE + } + kernelTask := kernel.TaskFromContext(ctx) + // ignoring errors and FUSE server reply is analogous to Linux's behavior. + req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in) + if err != nil { + // No way to invoke Call() with an errored request. + return + } + req.noReply = true + conn.CallAsync(kernelTask, req) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + fs := fd.filesystem() + creds := auth.CredentialsFromContext(ctx) + return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh) +} diff --git a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go index f72fe342e..6c82dce30 100644 --- a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go +++ b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go @@ -6,179 +6,191 @@ import ( "gvisor.dev/gvisor/pkg/state" ) -func (x *Request) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.Request" +func (x *inodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.inodeRefs" } -func (x *Request) StateFields() []string { +func (x *inodeRefs) StateFields() []string { return []string{ - "requestEntry", - "id", - "hdr", - "data", + "refCount", } } -func (x *Request) beforeSave() {} +func (x *inodeRefs) beforeSave() {} -func (x *Request) StateSave(m state.Sink) { +func (x *inodeRefs) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.requestEntry) - m.Save(1, &x.id) - m.Save(2, &x.hdr) - m.Save(3, &x.data) + m.Save(0, &x.refCount) } -func (x *Request) afterLoad() {} +func (x *inodeRefs) afterLoad() {} -func (x *Request) StateLoad(m state.Source) { - m.Load(0, &x.requestEntry) - m.Load(1, &x.id) - m.Load(2, &x.hdr) - m.Load(3, &x.data) +func (x *inodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) } -func (x *Response) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.Response" +func (x *requestList) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.requestList" } -func (x *Response) StateFields() []string { +func (x *requestList) StateFields() []string { return []string{ - "opcode", - "hdr", - "data", + "head", + "tail", } } -func (x *Response) beforeSave() {} +func (x *requestList) beforeSave() {} -func (x *Response) StateSave(m state.Sink) { +func (x *requestList) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.opcode) - m.Save(1, &x.hdr) - m.Save(2, &x.data) + m.Save(0, &x.head) + m.Save(1, &x.tail) } -func (x *Response) afterLoad() {} +func (x *requestList) afterLoad() {} -func (x *Response) StateLoad(m state.Source) { - m.Load(0, &x.opcode) - m.Load(1, &x.hdr) - m.Load(2, &x.data) +func (x *requestList) StateLoad(m state.Source) { + m.Load(0, &x.head) + m.Load(1, &x.tail) } -func (x *futureResponse) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.futureResponse" +func (x *requestEntry) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.requestEntry" } -func (x *futureResponse) StateFields() []string { +func (x *requestEntry) StateFields() []string { return []string{ - "opcode", - "ch", - "hdr", - "data", + "next", + "prev", } } -func (x *futureResponse) beforeSave() {} +func (x *requestEntry) beforeSave() {} -func (x *futureResponse) StateSave(m state.Sink) { +func (x *requestEntry) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.opcode) - m.Save(1, &x.ch) - m.Save(2, &x.hdr) - m.Save(3, &x.data) + m.Save(0, &x.next) + m.Save(1, &x.prev) } -func (x *futureResponse) afterLoad() {} +func (x *requestEntry) afterLoad() {} -func (x *futureResponse) StateLoad(m state.Source) { - m.Load(0, &x.opcode) - m.Load(1, &x.ch) - m.Load(2, &x.hdr) - m.Load(3, &x.data) +func (x *requestEntry) StateLoad(m state.Source) { + m.Load(0, &x.next) + m.Load(1, &x.prev) } -func (x *inodeRefs) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.inodeRefs" +func (x *Request) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.Request" } -func (x *inodeRefs) StateFields() []string { +func (x *Request) StateFields() []string { return []string{ - "refCount", + "requestEntry", + "id", + "hdr", + "data", + "payload", + "async", + "noReply", } } -func (x *inodeRefs) beforeSave() {} +func (x *Request) beforeSave() {} -func (x *inodeRefs) StateSave(m state.Sink) { +func (x *Request) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.refCount) + m.Save(0, &x.requestEntry) + m.Save(1, &x.id) + m.Save(2, &x.hdr) + m.Save(3, &x.data) + m.Save(4, &x.payload) + m.Save(5, &x.async) + m.Save(6, &x.noReply) } -func (x *inodeRefs) afterLoad() {} +func (x *Request) afterLoad() {} -func (x *inodeRefs) StateLoad(m state.Source) { - m.Load(0, &x.refCount) +func (x *Request) StateLoad(m state.Source) { + m.Load(0, &x.requestEntry) + m.Load(1, &x.id) + m.Load(2, &x.hdr) + m.Load(3, &x.data) + m.Load(4, &x.payload) + m.Load(5, &x.async) + m.Load(6, &x.noReply) } -func (x *requestList) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.requestList" +func (x *futureResponse) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.futureResponse" } -func (x *requestList) StateFields() []string { +func (x *futureResponse) StateFields() []string { return []string{ - "head", - "tail", + "opcode", + "ch", + "hdr", + "data", + "async", } } -func (x *requestList) beforeSave() {} +func (x *futureResponse) beforeSave() {} -func (x *requestList) StateSave(m state.Sink) { +func (x *futureResponse) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.head) - m.Save(1, &x.tail) + m.Save(0, &x.opcode) + m.Save(1, &x.ch) + m.Save(2, &x.hdr) + m.Save(3, &x.data) + m.Save(4, &x.async) } -func (x *requestList) afterLoad() {} +func (x *futureResponse) afterLoad() {} -func (x *requestList) StateLoad(m state.Source) { - m.Load(0, &x.head) - m.Load(1, &x.tail) +func (x *futureResponse) StateLoad(m state.Source) { + m.Load(0, &x.opcode) + m.Load(1, &x.ch) + m.Load(2, &x.hdr) + m.Load(3, &x.data) + m.Load(4, &x.async) } -func (x *requestEntry) StateTypeName() string { - return "pkg/sentry/fsimpl/fuse.requestEntry" +func (x *Response) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.Response" } -func (x *requestEntry) StateFields() []string { +func (x *Response) StateFields() []string { return []string{ - "next", - "prev", + "opcode", + "hdr", + "data", } } -func (x *requestEntry) beforeSave() {} +func (x *Response) beforeSave() {} -func (x *requestEntry) StateSave(m state.Sink) { +func (x *Response) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.next) - m.Save(1, &x.prev) + m.Save(0, &x.opcode) + m.Save(1, &x.hdr) + m.Save(2, &x.data) } -func (x *requestEntry) afterLoad() {} +func (x *Response) afterLoad() {} -func (x *requestEntry) StateLoad(m state.Source) { - m.Load(0, &x.next) - m.Load(1, &x.prev) +func (x *Response) StateLoad(m state.Source) { + m.Load(0, &x.opcode) + m.Load(1, &x.hdr) + m.Load(2, &x.data) } func init() { - state.Register((*Request)(nil)) - state.Register((*Response)(nil)) - state.Register((*futureResponse)(nil)) state.Register((*inodeRefs)(nil)) state.Register((*requestList)(nil)) state.Register((*requestEntry)(nil)) + state.Register((*Request)(nil)) + state.Register((*futureResponse)(nil)) + state.Register((*Response)(nil)) } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 810819ae4..402dabe5a 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -16,7 +16,10 @@ package fuse import ( + "math" "strconv" + "sync" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -26,11 +29,17 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // Name is the default filesystem name. const Name = "fuse" +// maxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const maxActiveRequestsDefault = 10000 + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -56,6 +65,11 @@ type filesystemOptions struct { // exist at any time. Any further requests will block when trying to // Call the server. maxActiveRequests uint64 + + // maxRead is the max number of bytes to read, + // specified as "max_read" in fs parameters. + // If not specified by user, use math.MaxUint32 as default value. + maxRead uint32 } // filesystem implements vfs.FilesystemImpl. @@ -69,6 +83,9 @@ type filesystem struct { // opts is the options the fusefs is initialized with. opts *filesystemOptions + + // umounted is true if filesystem.Release() has been called. + umounted bool } // Name implements vfs.FilesystemType.Name. @@ -142,14 +159,29 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Set the maxInFlightRequests option. fsopts.maxActiveRequests = maxActiveRequestsDefault + if maxReadStr, ok := mopts["max_read"]; ok { + delete(mopts, "max_read") + maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) + return nil, nil, syserror.EINVAL + } + if maxRead < fuseMinMaxRead { + maxRead = fuseMinMaxRead + } + fsopts.maxRead = uint32(maxRead) + } else { + fsopts.maxRead = math.MaxUint32 + } + // Check for unparsed options. if len(mopts) != 0 { - log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) + log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) return nil, nil, syserror.EINVAL } // Create a new FUSE filesystem. - fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) if err != nil { log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) return nil, nil, err @@ -165,26 +197,27 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // root is the fusefs root directory. - root := fs.newInode(creds, fsopts.rootMode) + root := fs.newRootInode(creds, fsopts.rootMode) return fs.VFSFilesystem(), root.VFSDentry(), nil } -// NewFUSEFilesystem creates a new FUSE filesystem. -func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { - fs := &filesystem{ - devMinor: devMinor, - opts: opts, - } - - conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests) +// newFUSEFilesystem creates a new FUSE filesystem. +func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { + conn, err := newFUSEConnection(ctx, device, opts) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, syserror.EINVAL } - fs.conn = conn fuseFD := device.Impl().(*DeviceFD) + + fs := &filesystem{ + devMinor: devMinor, + opts: opts, + conn: conn, + } + fuseFD.fs = fs return fs, nil @@ -192,6 +225,14 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { + fs.umounted = true + fs.conn.Abort(ctx) + + fs.conn.fd.mu.Lock() + // Notify all the waiters on this fd. + fs.conn.fd.waitQueue.Notify(waiter.EventIn) + fs.conn.fd.mu.Unlock() + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } @@ -205,14 +246,49 @@ type inode struct { kernfs.InodeNotSymlink kernfs.OrderedChildren + dentry kernfs.Dentry + + // the owning filesystem. fs is immutable. + fs *filesystem + + // metaDataMu protects the metadata of this inode. + metadataMu sync.Mutex + + nodeID uint64 + locks vfs.FileLocks - dentry kernfs.Dentry + // size of the file. + size uint64 + + // attributeVersion is the version of inode's attributes. + attributeVersion uint64 + + // attributeTime is the remaining vaild time of attributes. + attributeTime uint64 + + // version of the inode. + version uint64 + + // link is result of following a symbolic link. + link string +} + +func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { + i := &inode{fs: fs} + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755) + i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.dentry.Init(i) + i.nodeID = 1 + + return &i.dentry } -func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { - i := &inode{} - i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) +func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry { + i := &inode{fs: fs, nodeID: nodeID} + creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)} + i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode)) + atomic.StoreUint64(&i.size, attr.Size) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.EnableLeakCheck() i.dentry.Init(i) @@ -222,13 +298,301 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ - SeekEnd: kernfs.SeekEndStaticEntries, - }) + isDir := i.InodeAttrs.Mode().IsDir() + // return error if specified to open directory but inode is not a directory. + if !isDir && opts.Mode.IsDir() { + return nil, syserror.ENOTDIR + } + if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS { + return nil, syserror.EOVERFLOW + } + + var fd *fileDescription + var fdImpl vfs.FileDescriptionImpl + if isDir { + directoryFD := &directoryFD{} + fd = &(directoryFD.fileDescription) + fdImpl = directoryFD + } else { + regularFD := ®ularFileFD{} + fd = &(regularFD.fileDescription) + fdImpl = regularFD + } + // FOPEN_KEEP_CACHE is the defualt flag for noOpen. + fd.OpenFlag = linux.FOPEN_KEEP_CACHE + + // Only send open request when FUSE server support open or is opening a directory. + if !i.fs.conn.noOpen || isDir { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context") + return nil, syserror.EINVAL + } + + // Build the request. + var opcode linux.FUSEOpcode + if isDir { + opcode = linux.FUSE_OPENDIR + } else { + opcode = linux.FUSE_OPEN + } + + in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)} + if !i.fs.conn.atomicOTrunc { + in.Flags &= ^uint32(linux.O_TRUNC) + } + + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in) + if err != nil { + return nil, err + } + + // Send the request and receive the reply. + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return nil, err + } + if err := res.Error(); err == syserror.ENOSYS && !isDir { + i.fs.conn.noOpen = true + } else if err != nil { + return nil, err + } else { + out := linux.FUSEOpenOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return nil, err + } + + // Process the reply. + fd.OpenFlag = out.OpenFlag + if isDir { + fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO) + } + + fd.Fh = out.Fh + } + } + + // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode + fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0 + fdOptions := &vfs.FileDescriptionOptions{} + if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 { + fdOptions.DenyPRead = true + fdOptions.DenyPWrite = true + fd.Nonseekable = true + } + + // If we don't send SETATTR before open (which is indicated by atomicOTrunc) + // and O_TRUNC is set, update the inode's version number and clean existing data + // by setting the file size to 0. + if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 { + i.fs.conn.mu.Lock() + i.fs.conn.attributeVersion++ + i.attributeVersion = i.fs.conn.attributeVersion + atomic.StoreUint64(&i.size, 0) + i.fs.conn.mu.Unlock() + i.attributeTime = 0 + } + + if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), vfsd, fdOptions); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Lookup implements kernfs.Inode.Lookup. +func (i *inode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + in := linux.FUSELookupIn{Name: name} + return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in) +} + +// IterDirents implements kernfs.Inode.IterDirents. +func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return offset, nil +} + +// Valid implements kernfs.Inode.Valid. +func (*inode) Valid(ctx context.Context) bool { + return true +} + +// NewFile implements kernfs.Inode.NewFile. +func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID) + return nil, syserror.EINVAL + } + in := linux.FUSECreateIn{ + CreateMeta: linux.FUSECreateMeta{ + Flags: opts.Flags, + Mode: uint32(opts.Mode) | linux.S_IFREG, + Umask: uint32(kernelTask.FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in) +} + +// NewNode implements kernfs.Inode.NewNode. +func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) { + in := linux.FUSEMknodIn{ + MknodMeta: linux.FUSEMknodMeta{ + Mode: uint32(opts.Mode), + Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor), + Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in) +} + +// NewSymlink implements kernfs.Inode.NewSymlink. +func (i *inode) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) { + in := linux.FUSESymLinkIn{ + Name: name, + Target: target, + } + return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in) +} + +// Unlink implements kernfs.Inode.Unlink. +func (i *inode) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) + return syserror.EINVAL + } + in := linux.FUSEUnlinkIn{Name: name} + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) + if err != nil { + return err + } + res, err := i.fs.conn.Call(kernelTask, req) if err != nil { + return err + } + // only return error, discard res. + if err := res.Error(); err != nil { + return err + } + return i.dentry.RemoveChildLocked(name, child) +} + +// NewDir implements kernfs.Inode.NewDir. +func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { + in := linux.FUSEMkdirIn{ + MkdirMeta: linux.FUSEMkdirMeta{ + Mode: uint32(opts.Mode), + Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in) +} + +// RmDir implements kernfs.Inode.RmDir. +func (i *inode) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { + fusefs := i.fs + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) + + in := linux.FUSERmDirIn{Name: name} + req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in) + if err != nil { + return err + } + + res, err := i.fs.conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + + // TODO(Before merging): When creating new nodes, should we add nodes to the ordered children? + // If so we'll probably need to call this. We will also need to add them with the writable flag when + // appropriate. + // return i.OrderedChildren.RmDir(ctx, name, child) + + return nil +} + +// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response. +// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP. +func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*vfs.Dentry, error) { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) + return nil, syserror.EINVAL + } + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) + if err != nil { + return nil, err + } + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return nil, err + } + if err := res.Error(); err != nil { return nil, err } - return fd.VFSFileDescription(), nil + out := linux.FUSEEntryOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return nil, err + } + if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { + return nil, syserror.EIO + } + child := i.fs.newInode(out.NodeID, out.Attr) + if opcode == linux.FUSE_LOOKUP { + i.dentry.InsertChildLocked(name, child) + } else { + i.dentry.InsertChild(name, child) + } + return child.VFSDentry(), nil +} + +// Getlink implements kernfs.Inode.Getlink. +func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + path, err := i.Readlink(ctx, mnt) + return vfs.VirtualDentry{}, path, err +} + +// Readlink implements kernfs.Inode.Readlink. +func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { + if i.Mode().FileType()&linux.S_IFLNK == 0 { + return "", syserror.EINVAL + } + if len(i.link) == 0 { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") + return "", syserror.EINVAL + } + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) + if err != nil { + return "", err + } + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return "", err + } + i.link = string(res.data[res.hdr.SizeBytes():]) + if !mnt.Options().ReadOnly { + i.attributeTime = 0 + } + } + return i.link, nil +} + +// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache. +// TODO(gvisor.dev/issue/3679): Add support for other fields. +func (i *inode) getFUSEAttr() linux.FUSEAttr { + return linux.FUSEAttr{ + Ino: i.Ino(), + Size: atomic.LoadUint64(&i.size), + Mode: uint32(i.Mode()), + } } // statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The @@ -284,47 +648,89 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx { return stat } -// Stat implements kernfs.Inode.Stat. -func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - fusefs := fs.Impl().(*filesystem) - conn := fusefs.conn - task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) +// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request +// or read from local cache. It updates the corresponding attributes if +// necessary. +func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) { + attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion) + + // TODO(gvisor.dev/issue/3679): send the request only if + // - invalid local cache for fields specified in the opts.Mask + // - forced update + // - i.attributeTime expired + // If local cache is still valid, return local cache. + // Currently we always send a request, + // and we always set the metadata with the new result, + // unless attributeVersion has changed. + + task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return linux.Statx{}, syserror.EINVAL + return linux.FUSEAttr{}, syserror.EINVAL } - var in linux.FUSEGetAttrIn - // We don't set any attribute in the request, because in VFS2 fstat(2) will - // finally be translated into vfs.FilesystemImpl.StatAt() (see - // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow - // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2. - req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in) + creds := auth.CredentialsFromContext(ctx) + + in := linux.FUSEGetAttrIn{ + GetAttrFlags: flags, + Fh: fh, + } + req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) if err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } - res, err := conn.Call(task, req) + res, err := i.fs.conn.Call(task, req) if err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } if err := res.Error(); err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } var out linux.FUSEGetAttrOut if err := res.UnmarshalPayload(&out); err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } - // Set all metadata into kernfs.InodeAttrs. - if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{ - Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor), + // Local version is newer, return the local one. + // Skip the update. + if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion { + return i.getFUSEAttr(), nil + } + + // Set the metadata of kernfs.InodeAttrs. + if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), }); err != nil { + return linux.FUSEAttr{}, err + } + + // Set the size if no error (after SetStat() check). + atomic.StoreUint64(&i.size, out.Attr.Size) + + return out.Attr, nil +} + +// reviseAttr attempts to update the attributes for internal purposes +// by calling getAttr with a pre-specified mask. +// Used by read, write, lseek. +func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error { + // Never need atime for internal purposes. + _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME, + }, flags, fh) + return err +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + attr, err := i.getAttr(ctx, fs, opts, 0, 0) + if err != nil { return linux.Statx{}, err } - return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil + return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil } // DecRef implements kernfs.Inode. @@ -337,3 +743,84 @@ func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, e // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs. return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil } + +// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask +// aligned with the attribute mask defined in include/linux/fs.h. +func fattrMaskFromStats(mask uint32) uint32 { + var fuseAttrMask uint32 + maskMap := map[uint32]uint32{ + linux.STATX_MODE: linux.FATTR_MODE, + linux.STATX_UID: linux.FATTR_UID, + linux.STATX_GID: linux.FATTR_GID, + linux.STATX_SIZE: linux.FATTR_SIZE, + linux.STATX_ATIME: linux.FATTR_ATIME, + linux.STATX_MTIME: linux.FATTR_MTIME, + linux.STATX_CTIME: linux.FATTR_CTIME, + } + for statxMask, fattrMask := range maskMap { + if mask&statxMask != 0 { + fuseAttrMask |= fattrMask + } + } + return fuseAttrMask +} + +// SetStat implements kernfs.Inode.SetStat. +func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return i.setAttr(ctx, fs, creds, opts, false, 0) +} + +func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error { + conn := i.fs.conn + task := kernel.TaskFromContext(ctx) + if task == nil { + log.Warningf("couldn't get kernel task from context") + return syserror.EINVAL + } + + // We should retain the original file type when assigning new mode. + fileType := uint16(i.Mode()) & linux.S_IFMT + fattrMask := fattrMaskFromStats(opts.Stat.Mask) + if useFh { + fattrMask |= linux.FATTR_FH + } + in := linux.FUSESetAttrIn{ + Valid: fattrMask, + Fh: fh, + Size: opts.Stat.Size, + Atime: uint64(opts.Stat.Atime.Sec), + Mtime: uint64(opts.Stat.Mtime.Sec), + Ctime: uint64(opts.Stat.Ctime.Sec), + AtimeNsec: opts.Stat.Atime.Nsec, + MtimeNsec: opts.Stat.Mtime.Nsec, + CtimeNsec: opts.Stat.Ctime.Nsec, + Mode: uint32(fileType | opts.Stat.Mode), + UID: opts.Stat.UID, + GID: opts.Stat.GID, + } + req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) + if err != nil { + return err + } + + res, err := conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + out := linux.FUSEGetAttrOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + // Set the metadata of kernfs.InodeAttrs. + if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), + }); err != nil { + return err + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/inode_refs.go b/pkg/sentry/fsimpl/fuse/inode_refs.go index 646042846..6b9456e1d 100644 --- a/pkg/sentry/fsimpl/fuse/inode_refs.go +++ b/pkg/sentry/fsimpl/fuse/inode_refs.go @@ -1,12 +1,12 @@ package fuse import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go new file mode 100644 index 000000000..625d1547f --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -0,0 +1,242 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ReadInPages sends FUSE_READ requests for the size after round it up to +// a multiple of page size, blocks on it for reply, processes the reply +// and returns the payload (or joined payloads) as a byte slice. +// This is used for the general purpose reading. +// We do not support direct IO (which read the exact number of bytes) +// at this moment. +func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) { + attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion) + + t := kernel.TaskFromContext(ctx) + if t == nil { + log.Warningf("fusefs.Read: couldn't get kernel task from context") + return nil, 0, syserror.EINVAL + } + + // Round up to a multiple of page size. + readSize, _ := usermem.PageRoundUp(uint64(size)) + + // One request cannnot exceed either maxRead or maxPages. + maxPages := fs.conn.maxRead >> usermem.PageShift + if maxPages > uint32(fs.conn.maxPages) { + maxPages = uint32(fs.conn.maxPages) + } + + var outs [][]byte + var sizeRead uint32 + + // readSize is a multiple of usermem.PageSize. + // Always request bytes as a multiple of pages. + pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift) + + // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. + in := linux.FUSEReadIn{ + Fh: fd.Fh, + LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock + ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER + Flags: fd.statusFlags(), + } + + // This loop is intended for fragmented read where the bytes to read is + // larger than either the maxPages or maxRead. + // For the majority of reads with normal size, this loop should only + // execute once. + for pagesRead < pagesToRead { + pagesCanRead := pagesToRead - pagesRead + if pagesCanRead > maxPages { + pagesCanRead = maxPages + } + + in.Offset = off + (uint64(pagesRead) << usermem.PageShift) + in.Size = pagesCanRead << usermem.PageShift + + req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in) + if err != nil { + return nil, 0, err + } + + // TODO(gvisor.dev/issue/3247): support async read. + + res, err := fs.conn.Call(t, req) + if err != nil { + return nil, 0, err + } + if err := res.Error(); err != nil { + return nil, 0, err + } + + // Not enough bytes in response, + // either we reached EOF, + // or the FUSE server sends back a response + // that cannot even fit the hdr. + if len(res.data) <= res.hdr.SizeBytes() { + // We treat both case as EOF here for now + // since there is no reliable way to detect + // the over-short hdr case. + break + } + + // Directly using the slice to avoid extra copy. + out := res.data[res.hdr.SizeBytes():] + + outs = append(outs, out) + sizeRead += uint32(len(out)) + + pagesRead += pagesCanRead + } + + defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion) + + // No bytes returned: offset >= EOF. + if len(outs) == 0 { + return nil, 0, io.EOF + } + + return outs, sizeRead, nil +} + +// ReadCallback updates several information after receiving a read response. +// Due to readahead, sizeRead can be larger than size. +func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) { + // TODO(gvisor.dev/issue/3247): support async read. + // If this is called by an async read, correctly process it. + // May need to update the signature. + + i := fd.inode() + // TODO(gvisor.dev/issue/1193): Invalidate or update atime. + + // Reached EOF. + if sizeRead < size { + // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole. + // Might need to update the buf to be returned from the Read(). + + // Update existing size. + newSize := off + uint64(sizeRead) + fs.conn.mu.Lock() + if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) { + fs.conn.attributeVersion++ + i.attributeVersion = i.fs.conn.attributeVersion + atomic.StoreUint64(&i.size, newSize) + } + fs.conn.mu.Unlock() + } +} + +// Write sends FUSE_WRITE requests and return the bytes +// written according to the response. +// +// Preconditions: len(data) == size. +func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + log.Warningf("fusefs.Read: couldn't get kernel task from context") + return 0, syserror.EINVAL + } + + // One request cannnot exceed either maxWrite or maxPages. + maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift + if maxWrite > fs.conn.maxWrite { + maxWrite = fs.conn.maxWrite + } + + // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. + in := linux.FUSEWriteIn{ + Fh: fd.Fh, + // TODO(gvisor.dev/issue/3245): file lock + LockOwner: 0, + // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER + // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet) + WriteFlags: 0, + Flags: fd.statusFlags(), + } + + var written uint32 + + // This loop is intended for fragmented write where the bytes to write is + // larger than either the maxWrite or maxPages or when bigWrites is false. + // Unless a small value for max_write is explicitly used, this loop + // is expected to execute only once for the majority of the writes. + for written < size { + toWrite := size - written + + // Limit the write size to one page. + // Note that the bigWrites flag is obsolete, + // latest libfuse always sets it on. + if !fs.conn.bigWrites && toWrite > usermem.PageSize { + toWrite = usermem.PageSize + } + + // Limit the write size to maxWrite. + if toWrite > maxWrite { + toWrite = maxWrite + } + + in.Offset = off + uint64(written) + in.Size = toWrite + + req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in) + if err != nil { + return 0, err + } + + req.payload = data[written : written+toWrite] + + // TODO(gvisor.dev/issue/3247): support async write. + + res, err := fs.conn.Call(t, req) + if err != nil { + return 0, err + } + if err := res.Error(); err != nil { + return 0, err + } + + out := linux.FUSEWriteOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return 0, err + } + + // Write more than requested? EIO. + if out.Size > toWrite { + return 0, syserror.EIO + } + + written += out.Size + + // Break if short write. Not necessarily an error. + if out.Size != toWrite { + break + } + } + + return written, nil +} diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go new file mode 100644 index 000000000..5bdd096c3 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/regular_file.go @@ -0,0 +1,230 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type regularFileFD struct { + fileDescription + + // off is the file offset. + off int64 + // offMu protects off. + offMu sync.Mutex +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + + size := dst.NumBytes() + if size == 0 { + // Early return if count is 0. + return 0, nil + } else if size > math.MaxUint32 { + // FUSE only supports uint32 for size. + // Overflow. + return 0, syserror.EINVAL + } + + // TODO(gvisor.dev/issue/3678): Add direct IO support. + + inode := fd.inode() + + // Reading beyond EOF, update file size if outdated. + if uint64(offset+size) > atomic.LoadUint64(&inode.size) { + if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil { + return 0, err + } + // If the offset after update is still too large, return error. + if uint64(offset) >= atomic.LoadUint64(&inode.size) { + return 0, io.EOF + } + } + + // Truncate the read with updated file size. + fileSize := atomic.LoadUint64(&inode.size) + if uint64(offset+size) > fileSize { + size = int64(fileSize) - offset + } + + buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size)) + if err != nil { + return 0, err + } + + // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching), + // store the bytes that were read ahead. + + // Update the number of bytes to copy for short read. + if n < uint32(size) { + size = int64(n) + } + + // Copy the bytes read to the dst. + // This loop is intended for fragmented reads. + // For the majority of reads, this loop only execute once. + var copied int64 + for _, buffer := range buffers { + toCopy := int64(len(buffer)) + if copied+toCopy > size { + toCopy = size - copied + } + cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy]) + if err != nil { + return 0, err + } + if int64(cp) != toCopy { + return 0, syserror.EIO + } + copied += toCopy + } + + return copied, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off + fd.offMu.Unlock() + return n, err +} + +// pwrite returns the number of bytes written, final offset and error. The +// final offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { + if offset < 0 { + return 0, offset, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, offset, syserror.EOPNOTSUPP + } + + inode := fd.inode() + inode.metadataMu.Lock() + defer inode.metadataMu.Unlock() + + // If the file is opened with O_APPEND, update offset to file size. + // Note: since our Open() implements the interface of kernfs, + // and kernfs currently does not support O_APPEND, this will never + // be true before we switch out from kernfs. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Locking inode.metadataMu is sufficient for reading size + offset = int64(inode.size) + } + + srclen := src.NumBytes() + + if srclen > math.MaxUint32 { + // FUSE only supports uint32 for size. + // Overflow. + return 0, offset, syserror.EINVAL + } + if end := offset + srclen; end < offset { + // Overflow. + return 0, offset, syserror.EINVAL + } + + srclen, err = vfs.CheckLimit(ctx, offset, srclen) + if err != nil { + return 0, offset, err + } + + if srclen == 0 { + // Return before causing any side effects. + return 0, offset, nil + } + + src = src.TakeFirst64(srclen) + + // TODO(gvisor.dev/issue/3237): Add cache support: + // buffer cache. Ideally we write from src to our buffer cache first. + // The slice passed to fs.Write() should be a slice from buffer cache. + data := make([]byte, srclen) + // Reason for making a copy here: connection.Call() blocks on kerneltask, + // which in turn acquires mm.activeMu lock. Functions like CopyInTo() will + // attemp to acquire the mm.activeMu lock as well -> deadlock. + // We must finish reading from the userspace memory before + // t.Block() deactivates it. + cp, err := src.CopyIn(ctx, data) + if err != nil { + return 0, offset, err + } + if int64(cp) != srclen { + return 0, offset, syserror.EIO + } + + n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data) + if err != nil { + return 0, offset, err + } + + if n == 0 { + // We have checked srclen != 0 previously. + // If err == nil, then it's a short write and we return EIO. + return 0, offset, syserror.EIO + } + + written = int64(n) + finalOff = offset + written + + if finalOff > int64(inode.size) { + atomic.StoreUint64(&inode.size, uint64(finalOff)) + atomic.AddUint64(&inode.fs.conn.attributeVersion, 1) + } + + return +} diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go new file mode 100644 index 000000000..70db50f38 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/request_response.go @@ -0,0 +1,229 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/tools/go_marshal/marshal" +) + +// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE +// server may implement an older version of FUSE protocol, which contains a +// linux.FUSEInitOut with less attributes. +// +// Dynamically-sized objects cannot be marshalled. +type fuseInitRes struct { + marshal.StubMarshallable + + // initOut contains the response from the FUSE server. + initOut linux.FUSEInitOut + + // initLen is the total length of bytes of the response. + initLen uint32 +} + +// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes. +func (r *fuseInitRes) UnmarshalBytes(src []byte) { + out := &r.initOut + + // Introduced before FUSE kernel version 7.13. + out.Major = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + + // Introduced in FUSE kernel version 7.23. + if len(src) >= 4 { + out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + } + // Introduced in FUSE kernel version 7.28. + if len(src) >= 2 { + out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + } +} + +// SizeBytes is the size of the payload of the FUSE_INIT response. +func (r *fuseInitRes) SizeBytes() int { + return int(r.initLen) +} + +// Ordinary requests have even IDs, while interrupts IDs are odd. +// Used to increment the unique ID for each FUSE request. +var reqIDStep uint64 = 2 + +// Request represents a FUSE operation request that hasn't been sent to the +// server yet. +// +// +stateify savable +type Request struct { + requestEntry + + id linux.FUSEOpID + hdr *linux.FUSEHeaderIn + data []byte + + // payload for this request: extra bytes to write after + // the data slice. Used by FUSE_WRITE. + payload []byte + + // If this request is async. + async bool + // If we don't care its response. + // Manually set by the caller. + noReply bool +} + +// NewRequest creates a new request that can be sent to the FUSE server. +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) + + hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() + hdr := linux.FUSEHeaderIn{ + Len: uint32(hdrLen + payload.SizeBytes()), + Opcode: opcode, + Unique: conn.fd.nextOpID, + NodeID: ino, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + PID: pid, + } + + buf := make([]byte, hdr.Len) + + // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. + hdr.MarshalBytes(buf[:hdrLen]) + payload.MarshalBytes(buf[hdrLen:]) + + return &Request{ + id: hdr.Unique, + hdr: &hdr, + data: buf, + }, nil +} + +// futureResponse represents an in-flight request, that may or may not have +// completed yet. Convert it to a resolved Response by calling Resolve, but note +// that this may block. +// +// +stateify savable +type futureResponse struct { + opcode linux.FUSEOpcode + ch chan struct{} + hdr *linux.FUSEHeaderOut + data []byte + + // If this request is async. + async bool +} + +// newFutureResponse creates a future response to a FUSE request. +func newFutureResponse(req *Request) *futureResponse { + return &futureResponse{ + opcode: req.hdr.Opcode, + ch: make(chan struct{}), + async: req.async, + } +} + +// resolve blocks the task until the server responds to its corresponding request, +// then returns a resolved response. +func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { + // Return directly for async requests. + if f.async { + return nil, nil + } + + if err := t.Block(f.ch); err != nil { + return nil, err + } + + return f.getResponse(), nil +} + +// getResponse creates a Response from the data the futureResponse has. +func (f *futureResponse) getResponse() *Response { + return &Response{ + opcode: f.opcode, + hdr: *f.hdr, + data: f.data, + } +} + +// Response represents an actual response from the server, including the +// response payload. +// +// +stateify savable +type Response struct { + opcode linux.FUSEOpcode + hdr linux.FUSEHeaderOut + data []byte +} + +// Error returns the error of the FUSE call. +func (r *Response) Error() error { + errno := r.hdr.Error + if errno >= 0 { + return nil + } + + sysErrNo := syscall.Errno(-errno) + return error(sysErrNo) +} + +// DataLen returns the size of the response without the header. +func (r *Response) DataLen() uint32 { + return r.hdr.Len - uint32(r.hdr.SizeBytes()) +} + +// UnmarshalPayload unmarshals the response data into m. +func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { + hdrLen := r.hdr.SizeBytes() + haveDataLen := r.hdr.Len - uint32(hdrLen) + wantDataLen := uint32(m.SizeBytes()) + + if haveDataLen < wantDataLen { + return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) + } + + // The response data is empty unless there is some payload. And so, doesn't + // need to be unmarshalled. + if r.data == nil { + return nil + } + + // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. + m.UnmarshalBytes(r.data[hdrLen:]) + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 0e21c31a4..fa4e19113 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -195,7 +195,11 @@ const ( // and consistent with Linux's semantics (in particular, it is not always // possible for clients to set arbitrary atimes and mtimes depending on the // remote filesystem implementation, and never possible for clients to set - // arbitrary ctimes.) + // arbitrary ctimes.) If a dentry containing a client-defined atime or + // mtime is evicted from cache, client timestamps will be sent to the + // remote filesystem on a best-effort basis to attempt to ensure that + // timestamps will be preserved when another dentry representing the same + // file is instantiated. InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of @@ -1311,15 +1315,32 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.handleMu.Unlock() if !d.file.isNil() { - // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, - // i.e. client and server timestamps may differ (because e.g. a client - // write was serviced by the page cache, and only written back to the - // remote file later). Ideally, we'd write client timestamps back to - // the remote filesystem so that timestamps for a new dentry - // instantiated for the same file would remain coherent. Unfortunately, - // this turns out to be too expensive in many cases, so for now we - // don't do this. - if err := d.file.close(ctx); err != nil { + valid := p9.SetAttrMask{} + attr := p9.SetAttr{} + if !d.isDeleted() { + // Write dirty timestamps back to the remote filesystem. + if atomic.LoadUint32(&d.atimeDirty) != 0 { + valid.ATime = true + valid.ATimeNotSystemTime = true + atime := atomic.LoadInt64(&d.atime) + attr.ATimeSeconds = uint64(atime / 1e9) + attr.ATimeNanoSeconds = uint64(atime % 1e9) + } + if atomic.LoadUint32(&d.mtimeDirty) != 0 { + valid.MTime = true + valid.MTimeNotSystemTime = true + mtime := atomic.LoadInt64(&d.mtime) + attr.MTimeSeconds = uint64(mtime / 1e9) + attr.MTimeNanoSeconds = uint64(mtime % 1e9) + } + } + + // Check if attributes need to be changed before closing the file. + if valid.ATime || valid.MTime { + if err := d.file.setAttrClose(ctx, valid, attr); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to close file with write dirty timestamps: %v", err) + } + } else if err := d.file.close(ctx); err != nil { log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) } d.file = p9file{} diff --git a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go index 670166c76..babb3f664 100644 --- a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go +++ b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go @@ -1,12 +1,12 @@ package host import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/host/inode_refs.go b/pkg/sentry/fsimpl/host/inode_refs.go index 49c57c113..17f90ce4a 100644 --- a/pkg/sentry/fsimpl/host/inode_refs.go +++ b/pkg/sentry/fsimpl/host/inode_refs.go @@ -1,12 +1,12 @@ package host import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go index c0bf45f08..35ded24bc 100644 --- a/pkg/sentry/fsimpl/host/socket_unsafe.go +++ b/pkg/sentry/fsimpl/host/socket_unsafe.go @@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, nil + return length, n, msg.Controllen, controlTrunc, err } - return n, n, msg.Controllen, controlTrunc, nil + return n, n, msg.Controllen, controlTrunc, err } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/fsimpl/kernfs/dentry_refs.go b/pkg/sentry/fsimpl/kernfs/dentry_refs.go index 0269851c9..79863b3bc 100644 --- a/pkg/sentry/fsimpl/kernfs/dentry_refs.go +++ b/pkg/sentry/fsimpl/kernfs/dentry_refs.go @@ -1,12 +1,12 @@ package kernfs import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index d7d3e8f48..49f6a0f1d 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -140,7 +140,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir } // Reference on childVFSD dropped by a corresponding Valid. child = childVFSD.Impl().(*Dentry) - parent.insertChildLocked(name, child) + parent.InsertChildLocked(name, child) } return child, nil } @@ -548,7 +548,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st if !d.Impl().(*Dentry).isSymlink() { return "", syserror.EINVAL } - return inode.Readlink(ctx) + return inode.Readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. @@ -657,6 +657,10 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() + + // Store the name before walkExistingLocked as rp will be advanced past the + // name in the following call. + name := rp.Component() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { @@ -686,7 +690,8 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + + if err := parentDentry.inode.RmDir(ctx, name, vfsd); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } @@ -765,6 +770,10 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() + + // Store the name before walkExistingLocked as rp will be advanced past the + // name in the following call. + name := rp.Component() vfsd, _, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { @@ -790,7 +799,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + if err := parentDentry.inode.Unlink(ctx, name, vfsd); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index c0b863ba4..ea4f679c2 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -172,7 +172,7 @@ func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (InodeNotSymlink) Readlink(context.Context) (string, error) { +func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { return "", syserror.EINVAL } @@ -256,6 +256,13 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li // SetStat implements Inode.SetStat. func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return a.SetInodeStat(ctx, fs, creds, opts) +} + +// SetInodeStat sets the corresponding attributes from opts to InodeAttrs. +// This function can be used by other kernfs-based filesystem implementation to +// sets the unexported attributes into kernfs.InodeAttrs. +func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 88fcd54aa..163f26ceb 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -60,6 +60,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory @@ -246,15 +247,15 @@ func (d *Dentry) OnZeroWatches(context.Context) {} // Precondition: d must represent a directory inode. func (d *Dentry) InsertChild(name string, child *Dentry) { d.dirMu.Lock() - d.insertChildLocked(name, child) + d.InsertChildLocked(name, child) d.dirMu.Unlock() } -// insertChildLocked is equivalent to InsertChild, with additional +// InsertChildLocked is equivalent to InsertChild, with additional // preconditions. // // Precondition: d.dirMu must be locked. -func (d *Dentry) insertChildLocked(name string, child *Dentry) { +func (d *Dentry) InsertChildLocked(name string, child *Dentry) { if !d.isDir() { panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) } @@ -267,6 +268,36 @@ func (d *Dentry) insertChildLocked(name string, child *Dentry) { d.children[name] = child } +// RemoveChild removes child from the vfs dentry cache. This does not update the +// directory inode or modify the inode to be unlinked. So calling this on its own +// isn't sufficient to remove a child from a directory. +// +// Precondition: d must represent a directory inode. +func (d *Dentry) RemoveChild(name string, child *vfs.Dentry) error { + d.dirMu.Lock() + defer d.dirMu.Unlock() + return d.RemoveChildLocked(name, child) +} + +// RemoveChildLocked is equivalent to RemoveChild, with additional +// preconditions. +// +// Precondition: d.dirMu must be locked. +func (d *Dentry) RemoveChildLocked(name string, child *vfs.Dentry) error { + if !d.isDir() { + panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d)) + } + c, ok := d.children[name] + if !ok { + return syserror.ENOENT + } + if &c.vfsd != child { + panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child)) + } + delete(d.children, name) + return nil +} + // Inode returns the dentry's inode. func (d *Dentry) Inode() Inode { return d.inode @@ -425,7 +456,7 @@ type inodeDynamicLookup interface { Valid(ctx context.Context) bool // IterDirents is used to iterate over dynamically created entries. It invokes - // cb on each entry in the directory represented by the FileDescription. + // cb on each entry in the directory represented by the Inode. // 'offset' is the offset for the entire IterDirents call, which may include // results from the caller (e.g. "." and ".."). 'relOffset' is the offset // inside the entries returned by this IterDirents invocation. In other words, @@ -437,7 +468,7 @@ type inodeDynamicLookup interface { type inodeSymlink interface { // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. - Readlink(ctx context.Context) (string, error) + Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: diff --git a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go index 866620ef9..478b04bdd 100644 --- a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go +++ b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go @@ -1,12 +1,12 @@ package kernfs import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 64731a3e4..a9812fcef 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -52,7 +52,7 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor } // Readlink implements Inode. -func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { +func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 360b77ef6..c589b4746 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -82,8 +81,6 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Start: d.parent.upperVD, Path: fspath.Parse(d.name), } - // Used during copy-up of memory-mapped regular files. - var mmapOpts *memmap.MMapOpts cleanupUndoCopyUp := func() { var err error if ftype == linux.S_IFDIR { @@ -139,25 +136,6 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { break } } - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - if d.wrappedMappable != nil { - // We may have memory mappings of the file on the lower layer. - // Switch to mapping the file on the upper layer instead. - mmapOpts = &memmap.MMapOpts{ - Perms: usermem.ReadWrite, - MaxPerms: usermem.ReadWrite, - } - if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil { - cleanupUndoCopyUp() - return err - } - if mmapOpts.MappingIdentity != nil { - mmapOpts.MappingIdentity.DecRef(ctx) - } - // Don't actually switch Mappables until the end of copy-up; see - // below for why. - } if err := newFD.SetStat(ctx, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID, @@ -287,62 +265,6 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { atomic.StoreUint64(&d.ino, upperStat.Ino) } - if mmapOpts != nil && mmapOpts.Mappable != nil { - // Note that if mmapOpts != nil, then d.mapsMu is locked for writing - // (from the S_IFREG path above). - - // Propagate mappings of d to the new Mappable. Remember which mappings - // we added so we can remove them on failure. - upperMappable := mmapOpts.Mappable - allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) - for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - added := make(memmap.MappingsOfRange) - for m := range seg.Value() { - if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { - for m := range added { - upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) - } - for mr, mappings := range allAdded { - for m := range mappings { - upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) - } - } - return err - } - added[m] = struct{}{} - } - allAdded[seg.Range()] = added - } - - // Switch to the new Mappable. We do this at the end of copy-up - // because: - // - // - We need to switch Mappables (by changing d.wrappedMappable) before - // invalidating Translations from the old Mappable (to pick up - // Translations from the new one). - // - // - We need to lock d.dataMu while changing d.wrappedMappable, but - // must invalidate Translations with d.dataMu unlocked (due to lock - // ordering). - // - // - Consequently, once we unlock d.dataMu, other threads may - // immediately observe the new (copied-up) Mappable, which we want to - // delay until copy-up is guaranteed to succeed. - d.dataMu.Lock() - lowerMappable := d.wrappedMappable - d.wrappedMappable = upperMappable - d.dataMu.Unlock() - d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{}) - - // Remove mappings from the old Mappable. - for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - for m := range seg.Value() { - lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) - } - } - d.lowerMappings.RemoveAll() - } - atomic.StoreUint32(&d.copiedUp, 1) return nil } diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index 74cfd3799..268b32537 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -257,105 +256,10 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error { // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - if err := fd.ensureMappable(ctx, opts); err != nil { - return err - } - return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts) -} - -// ensureMappable ensures that fd.dentry().wrappedMappable is not nil. -func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { - d := fd.dentry() - - // Fast path if we already have a Mappable for the current top layer. - if atomic.LoadUint32(&d.isMappable) != 0 { - return nil - } - - // Only permit mmap of regular files, since other file types may have - // unpredictable behavior when mmapped (e.g. /dev/zero). - if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG { - return syserror.ENODEV - } - - // Get a Mappable for the current top layer. - fd.mu.Lock() - defer fd.mu.Unlock() - d.copyMu.RLock() - defer d.copyMu.RUnlock() - if atomic.LoadUint32(&d.isMappable) != 0 { - return nil - } - wrappedFD, err := fd.currentFDLocked(ctx) + wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return err } - if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil { - return err - } - if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef(ctx) - opts.MappingIdentity = nil - } - // Use this Mappable for all mappings of this layer (unless we raced with - // another call to ensureMappable). - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.dataMu.Lock() - defer d.dataMu.Unlock() - if d.wrappedMappable == nil { - d.wrappedMappable = opts.Mappable - atomic.StoreUint32(&d.isMappable, 1) - } - return nil -} - -// AddMapping implements memmap.Mappable.AddMapping. -func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil { - return err - } - if !d.isCopiedUp() { - d.lowerMappings.AddMapping(ms, ar, offset, writable) - } - return nil -} - -// RemoveMapping implements memmap.Mappable.RemoveMapping. -func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable) - if !d.isCopiedUp() { - d.lowerMappings.RemoveMapping(ms, ar, offset, writable) - } -} - -// CopyMapping implements memmap.Mappable.CopyMapping. -func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { - return err - } - if !d.isCopiedUp() { - d.lowerMappings.AddMapping(ms, dstAR, offset, writable) - } - return nil -} - -// Translate implements memmap.Mappable.Translate. -func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { - d.dataMu.RLock() - defer d.dataMu.RUnlock() - return d.wrappedMappable.Translate(ctx, required, optional, at) -} - -// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. -func (d *dentry) InvalidateUnsavable(ctx context.Context) error { - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - return d.wrappedMappable.InvalidateUnsavable(ctx) + defer wrappedFD.DecRef(ctx) + return wrappedFD.ConfigureMMap(ctx, opts) } diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index b2efe5f80..9a8f7010e 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -22,10 +22,6 @@ // filesystem.renameMu // dentry.dirMu // dentry.copyMu -// *** "memmap.Mappable locks" below this point -// dentry.mapsMu -// *** "memmap.Mappable locks taken by Translate" below this point -// dentry.dataMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to @@ -41,7 +37,6 @@ import ( "gvisor.dev/gvisor/pkg/fspath" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -424,35 +419,6 @@ type dentry struct { devMinor uint32 ino uint64 - // If this dentry represents a regular file, then: - // - // - mapsMu is used to synchronize between copy-up and memmap.Mappable - // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. - // - // - dataMu is used to synchronize between copy-up and - // dentry.(memmap.Mappable).Translate. - // - // - lowerMappings tracks memory mappings of the file. lowerMappings is - // used to invalidate mappings of the lower layer when the file is copied - // up to ensure that they remain coherent with subsequent writes to the - // file. (Note that, as of this writing, Linux overlayfs does not do this; - // this feature is a gVisor extension.) lowerMappings is protected by - // mapsMu. - // - // - If this dentry is copied-up, then wrappedMappable is the Mappable - // obtained from a call to the current top layer's - // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil - // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil. - // wrappedMappable is protected by mapsMu and dataMu. - // - // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is - // accessed using atomic memory operations. - mapsMu sync.Mutex - lowerMappings memmap.MappingSet - dataMu sync.RWMutex - wrappedMappable memmap.Mappable - isMappable uint32 - locks vfs.FileLocks } diff --git a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go index 84c03e6cd..9431c1506 100644 --- a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go @@ -1,12 +1,12 @@ package proc import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go index 5b3c7cd1c..872b20eb0 100644 --- a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go @@ -1,12 +1,12 @@ package proc import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go index 3091deb89..c6d9b3522 100644 --- a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go @@ -1,12 +1,12 @@ package proc import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 3f0d78461..5374538c9 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -209,7 +209,7 @@ func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *ker return d } -func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { +func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return "", syserror.ENOENT diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 356036b9b..4f7f9cb00 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -668,7 +668,7 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr } // Readlink implements kernfs.Inode. -func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { +func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { return "", syserror.EACCES } @@ -808,11 +808,11 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri } // Readlink implements Inode. -func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { +func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if err := checkTaskState(s.task); err != nil { return "", err } - return s.StaticSymlink.Readlink(ctx) + return s.StaticSymlink.Readlink(ctx, mnt) } // Getlink implements Inode.Getlink. diff --git a/pkg/sentry/fsimpl/proc/task_inode_refs.go b/pkg/sentry/fsimpl/proc/task_inode_refs.go index 5833527c5..714488450 100644 --- a/pkg/sentry/fsimpl/proc/task_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/task_inode_refs.go @@ -1,12 +1,12 @@ package proc import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 8c41729e4..68c541046 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -51,7 +51,7 @@ func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns return d } -func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -64,8 +64,8 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } -func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } @@ -94,7 +94,7 @@ func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, return d } -func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -108,8 +108,8 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } -func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } diff --git a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go index 0961dd629..22d9cc488 100644 --- a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go +++ b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go @@ -1,12 +1,12 @@ package proc import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/sys/dir_refs.go b/pkg/sentry/fsimpl/sys/dir_refs.go index 718d542eb..89609b198 100644 --- a/pkg/sentry/fsimpl/sys/dir_refs.go +++ b/pkg/sentry/fsimpl/sys/dir_refs.go @@ -1,12 +1,12 @@ package sys import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/fsimpl/tmpfs/inode_refs.go b/pkg/sentry/fsimpl/tmpfs/inode_refs.go index 5ec208d9e..dbf0b2766 100644 --- a/pkg/sentry/fsimpl/tmpfs/inode_refs.go +++ b/pkg/sentry/fsimpl/tmpfs/inode_refs.go @@ -1,12 +1,12 @@ package tmpfs import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/fd_table_refs.go b/pkg/sentry/kernel/fd_table_refs.go index 9330e37e4..ecba138ac 100644 --- a/pkg/sentry/kernel/fd_table_refs.go +++ b/pkg/sentry/kernel/fd_table_refs.go @@ -1,12 +1,12 @@ package kernel import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/fs_context_refs.go b/pkg/sentry/kernel/fs_context_refs.go index e2280f400..fb2fde971 100644 --- a/pkg/sentry/kernel/fs_context_refs.go +++ b/pkg/sentry/kernel/fs_context_refs.go @@ -1,12 +1,12 @@ package kernel import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/process_group_refs.go b/pkg/sentry/kernel/process_group_refs.go index 832a99d66..4ed6e6458 100644 --- a/pkg/sentry/kernel/process_group_refs.go +++ b/pkg/sentry/kernel/process_group_refs.go @@ -1,12 +1,12 @@ package kernel import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go index 90148bbb2..8a2418c41 100644 --- a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go +++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go @@ -1,12 +1,12 @@ package kernel import ( - "fmt" - "reflect" "strings" "unsafe" + "fmt" "gvisor.dev/gvisor/pkg/sync" + "reflect" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race diff --git a/pkg/sentry/kernel/session_refs.go b/pkg/sentry/kernel/session_refs.go index 850f56d3a..f2e1bb797 100644 --- a/pkg/sentry/kernel/session_refs.go +++ b/pkg/sentry/kernel/session_refs.go @@ -1,12 +1,12 @@ package kernel import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/kernel/shm/shm_refs.go b/pkg/sentry/kernel/shm/shm_refs.go index 35f74eb6c..51e07d0b3 100644 --- a/pkg/sentry/kernel/shm/shm_refs.go +++ b/pkg/sentry/kernel/shm/shm_refs.go @@ -1,12 +1,12 @@ package shm import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/mm/aio_mappable_refs.go b/pkg/sentry/mm/aio_mappable_refs.go index d4f278400..b99909f07 100644 --- a/pkg/sentry/mm/aio_mappable_refs.go +++ b/pkg/sentry/mm/aio_mappable_refs.go @@ -1,12 +1,12 @@ package mm import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/mm/special_mappable_refs.go b/pkg/sentry/mm/special_mappable_refs.go index 2a348bacf..035bbe690 100644 --- a/pkg/sentry/mm/special_mappable_refs.go +++ b/pkg/sentry/mm/special_mappable_refs.go @@ -1,12 +1,12 @@ package mm import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/platform/ring0/defs_impl_arm64.go b/pkg/sentry/platform/ring0/defs_impl_arm64.go index 2dac9ad14..424b66f76 100644 --- a/pkg/sentry/platform/ring0/defs_impl_arm64.go +++ b/pkg/sentry/platform/ring0/defs_impl_arm64.go @@ -3,11 +3,11 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "reflect" "fmt" "gvisor.dev/gvisor/pkg/usermem" "io" + "reflect" ) // Useful bits. diff --git a/pkg/sentry/platform/ring0/entry_impl_arm64.s b/pkg/sentry/platform/ring0/entry_impl_arm64.s index 5bf870815..f42800ebf 100644 --- a/pkg/sentry/platform/ring0/entry_impl_arm64.s +++ b/pkg/sentry/platform/ring0/entry_impl_arm64.s @@ -91,9 +91,7 @@ // ERET returns using the ELR and SPSR for the current exception level. #define ERET() \ - WORD $0xd69f03e0; \ - DSB $7; \ - ISB $15; + WORD $0xd69f03e0 // RSV_REG is a register that holds el1 information temporarily. #define RSV_REG R18_PLATFORM diff --git a/pkg/sentry/socket/unix/socket_refs.go b/pkg/sentry/socket/unix/socket_refs.go index a0e5d1393..dababb85f 100644 --- a/pkg/sentry/socket/unix/socket_refs.go +++ b/pkg/sentry/socket/unix/socket_refs.go @@ -1,12 +1,12 @@ package unix import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/socket/unix/transport/queue_refs.go b/pkg/sentry/socket/unix/transport/queue_refs.go index 21d43fc24..0d4e34988 100644 --- a/pkg/sentry/socket/unix/transport/queue_refs.go +++ b/pkg/sentry/socket/unix/transport/queue_refs.go @@ -1,12 +1,12 @@ package transport import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/time/seqatomic_parameters_unsafe.go b/pkg/sentry/time/seqatomic_parameters_unsafe.go index 2cb001080..14978ed91 100644 --- a/pkg/sentry/time/seqatomic_parameters_unsafe.go +++ b/pkg/sentry/time/seqatomic_parameters_unsafe.go @@ -1,12 +1,12 @@ package time import ( - "fmt" - "reflect" "strings" "unsafe" + "fmt" "gvisor.dev/gvisor/pkg/sync" + "reflect" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 68b80a951..2b668fd89 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -107,7 +107,7 @@ func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSeque // file_operations::iterate == file_operations::iterate_shared == NULL in // Linux. func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { - return syserror.ENOTDIR + return syserror.ENOSYS } // Seek implements FileDescriptionImpl.Seek analogously to diff --git a/pkg/sentry/vfs/file_description_refs.go b/pkg/sentry/vfs/file_description_refs.go index ebc089b4c..bdd7e6554 100644 --- a/pkg/sentry/vfs/file_description_refs.go +++ b/pkg/sentry/vfs/file_description_refs.go @@ -1,12 +1,12 @@ package vfs import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/vfs/filesystem_refs.go b/pkg/sentry/vfs/filesystem_refs.go index 32f21c044..38a9a986f 100644 --- a/pkg/sentry/vfs/filesystem_refs.go +++ b/pkg/sentry/vfs/filesystem_refs.go @@ -1,12 +1,12 @@ package vfs import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 9da09d4c1..06ca91989 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -18,12 +18,14 @@ import ( "bytes" "fmt" "math" + "path" "sort" "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) @@ -738,23 +740,11 @@ func (mntns *MountNamespace) Root() VirtualDentry { // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - rootMnt := taskRootDir.mount - vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount mounts := rootMnt.submountsLocked() - // Take a reference on mounts since we need to drop vfs.mountMu before - // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). - for _, mnt := range mounts { - mnt.IncRef() - } - vfs.mountMu.Unlock() - defer func() { - for _, mnt := range mounts { - mnt.DecRef(ctx) - } - }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) - for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -765,7 +755,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -799,25 +789,11 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - rootMnt := taskRootDir.mount - vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount mounts := rootMnt.submountsLocked() - // Take a reference on mounts since we need to drop vfs.mountMu before - // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or - // vfs.StatAt() (=> FilesystemImpl.StatAt()). - for _, mnt := range mounts { - mnt.IncRef() - } - vfs.mountMu.Unlock() - defer func() { - for _, mnt := range mounts { - mnt.DecRef(ctx) - } - }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) - - creds := auth.CredentialsFromContext(ctx) for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -828,7 +804,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -841,10 +817,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo Root: mntRootVD, Start: mntRootVD, } - statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) + statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) if err != nil { // Well that's not good. Ignore this mount. - ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) break } @@ -856,9 +831,6 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo fmt.Fprintf(buf, "%d ", mnt.ID) // (2) Parent ID (or this ID if there is no parent). - // Note that even if the call to mnt.parent() races with Mount - // destruction (which is possible since we're not holding vfs.mountMu), - // its Mount.ID will still be valid. pID := mnt.ID if p := mnt.parent(); p != nil { pID = p.ID @@ -907,6 +879,30 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo } } +// MakeSyntheticMountpoint creates parent directories of target if they do not +// exist and attempts to create a directory for the mountpoint. If a +// non-directory file already exists there then we allow it. +func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { + mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + + // Make sure the parent directory of target exists. + if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil { + return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) + } + + // Attempt to mkdir the final component. If a file (of any type) exists + // then we let allow mounting on top of that because we do not require the + // target to be an existing directory, unlike Linux mount(2). + if err := vfs.MkdirAt(ctx, creds, &PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(target), + }, mkdirOpts); err != nil && err != syserror.EEXIST { + return fmt.Errorf("failed to create mountpoint %q: %w", target, err) + } + return nil +} + // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. // See Linux fs/seq_file.c:mangle_path. func manglePath(p string) string { diff --git a/pkg/sentry/vfs/mount_namespace_refs.go b/pkg/sentry/vfs/mount_namespace_refs.go index 2c920ac41..63285fb8e 100644 --- a/pkg/sentry/vfs/mount_namespace_refs.go +++ b/pkg/sentry/vfs/mount_namespace_refs.go @@ -1,12 +1,12 @@ package vfs import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 1ebf355ef..ed1cf99ba 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -819,30 +819,6 @@ func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string return nil } -// MakeSyntheticMountpoint creates parent directories of target if they do not -// exist and attempts to create a directory for the mountpoint. If a -// non-directory file already exists there then we allow it. -func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { - mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} - - // Make sure the parent directory of target exists. - if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil { - return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) - } - - // Attempt to mkdir the final component. If a file (of any type) exists - // then we let allow mounting on top of that because we do not require the - // target to be an existing directory, unlike Linux mount(2). - if err := vfs.MkdirAt(ctx, creds, &PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(target), - }, mkdirOpts); err != nil && err != syserror.EEXIST { - return fmt.Errorf("failed to create mountpoint %q: %w", target, err) - } - return nil -} - // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index fe9f50169..f516c8e46 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -33,6 +33,7 @@ var ( EBADFD = error(syscall.EBADFD) EBUSY = error(syscall.EBUSY) ECHILD = error(syscall.ECHILD) + ECONNABORTED = error(syscall.ECONNABORTED) ECONNREFUSED = error(syscall.ECONNREFUSED) ECONNRESET = error(syscall.ECONNRESET) EDEADLK = error(syscall.EDEADLK) diff --git a/pkg/tcpip/link/tun/tun_endpoint_refs.go b/pkg/tcpip/link/tun/tun_endpoint_refs.go index 06d11f0e1..e0595429c 100644 --- a/pkg/tcpip/link/tun/tun_endpoint_refs.go +++ b/pkg/tcpip/link/tun/tun_endpoint_refs.go @@ -1,12 +1,12 @@ package tun import ( + "fmt" + "runtime" "sync/atomic" - "fmt" "gvisor.dev/gvisor/pkg/log" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" - "runtime" ) // ownerType is used to customize logging. Note that we use a pointer to T so |