diff options
Diffstat (limited to 'pkg/sentry/fs')
-rw-r--r-- | pkg/sentry/fs/README.md | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext4/disklayout/BUILD | 8 | ||||
-rw-r--r-- | pkg/sentry/fs/ext4/disklayout/block_group.go | 22 | ||||
-rw-r--r-- | pkg/sentry/fs/ext4/disklayout/block_group_32.go | 4 | ||||
-rw-r--r-- | pkg/sentry/fs/ext4/disklayout/inode.go | 267 | ||||
-rw-r--r-- | pkg/sentry/fs/ext4/disklayout/superblock.go | 22 | ||||
-rw-r--r-- | pkg/sentry/fs/g3doc/inotify.md | 4 | ||||
-rw-r--r-- | pkg/sentry/fs/proc/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/fs/proc/fds.go | 38 | ||||
-rw-r--r-- | pkg/sentry/fs/proc/task.go | 4 |
10 files changed, 323 insertions, 49 deletions
diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index c4e8faa3c..db4a1b730 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -69,7 +69,7 @@ Specifically this state is: - An `fs.MountManager` containing mount points. -- A `kernel.FDMap` containing pointers to open files. +- A `kernel.FDTable` containing pointers to open files. Anything else managed by the VFS that can be easily loaded into memory from a filesystem is synced back to those filesystems and is not saved. Examples are diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD index cdac63655..e3d2f8d71 100644 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ b/pkg/sentry/fs/ext4/disklayout/BUILD @@ -8,6 +8,7 @@ go_library( "block_group.go", "block_group_32.go", "block_group_64.go", + "inode.go", "superblock.go", "superblock_32.go", "superblock_64.go", @@ -15,7 +16,12 @@ go_library( "test_utils.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout", - deps = ["//pkg/binary"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + ], ) go_test( diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext4/disklayout/block_group.go index 7df76a036..32ea3d97d 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group.go @@ -12,26 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package disklayout provides ext4 disk level structures which can be directly -// filled with bytes from the underlying device. All structures on disk are in -// little-endian order. Only jbd2 (journal) structures are in big-endian order. -// Structs aim to emulate structures `exactly` how they are layed out on disk. -// -// Note: All fields in these structs are exported because binary.Read would -// panic otherwise. package disklayout -// BlockGroup represents Linux struct ext4_group_desc which is internally -// called a block group descriptor. An ext4 file system is split into a series -// of block groups. This provides an access layer to information needed to -// access and use a block group. +// BlockGroup represents a Linux ext block group descriptor. An ext file system +// is split into a series of block groups. This provides an access layer to +// information needed to access and use a block group. // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { // InodeTable returns the absolute block number of the block containing the // inode table. This points to an array of Inode structs. Inode tables are // statically allocated at mkfs time. The superblock records the number of - // inodes per group (length of this table). + // inodes per group (length of this table) and the size of each inode struct. InodeTable() uint64 // BlockBitmap returns the absolute block number of the block containing the @@ -73,15 +65,15 @@ type BlockGroup interface { // Checksum returns this block group's checksum. // - // If RO_COMPAT_METADATA_CSUM feature is set: + // If SbMetadataCsum feature is set: // - checksum is crc32c(FS UUID + group number + group descriptor // structure) & 0xFFFF. // - // If RO_COMPAT_GDT_CSUM feature is set: + // If SbGdtCsum feature is set: // - checksum is crc16(FS UUID + group number + group descriptor // structure). // - // RO_COMPAT_METADATA_CSUM and RO_COMPAT_GDT_CSUM should not be both set. + // SbMetadataCsum and SbGdtCsum should not be both set. // If they are, Linux warns and asks to run fsck. Checksum() uint16 diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext4/disklayout/block_group_32.go index 087f1fb4a..dadecb3e3 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group_32.go @@ -15,8 +15,8 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in -// fs/ext4/ext4.h. It is the block group descriptor struct for 32-bit ext4 -// filesystems. It implements BlockGroup interface. +// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and +// 32-bit ext4 filesystems. It implements BlockGroup interface. // // The suffix `Lo` here stands for lower bits because this is also used in the // 64-bit version where these fields represent the lower half of the fields. diff --git a/pkg/sentry/fs/ext4/disklayout/inode.go b/pkg/sentry/fs/ext4/disklayout/inode.go new file mode 100644 index 000000000..b48001910 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/inode.go @@ -0,0 +1,267 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// The Inode interface must be implemented by structs representing ext inodes. +// The inode stores all the metadata pertaining to the file (except for the +// file name which is held by the directory entry). It does NOT expose all +// fields and should be extended if need be. +// +// Some file systems (e.g. FAT) use the directory entry to store all this +// information. Ext file systems do not so that they can support hard links. +// However, ext4 cheats a little bit and duplicates the file type in the +// directory entry for performance gains. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. +type Inode interface { + // Mode returns the linux file mode which is majorly used to extract + // information like: + // - File permissions (read/write/execute by user/group/others). + // - Sticky, set UID and GID bits. + // - File type. + // + // Masks to extract this information are provided in pkg/abi/linux/file.go. + Mode() linux.FileMode + + // UID returns the owner UID. + UID() auth.KUID + + // GID returns the owner GID. + GID() auth.KGID + + // Size returns the size of the file in bytes. + Size() uint64 + + // InodeSize returns the size of this inode struct in bytes. + // In ext2 and ext3, the inode struct and inode disk record size was fixed at + // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. + // However, accessing any field beyond the 128 bytes marker must be verified + // using this method. + InodeSize() uint16 + + // AccessTime returns the last access time. Shows when the file was last read. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the extended attribute value checksum. + AccessTime() time.Time + + // ChangeTime returns the last change time. Shows when the file meta data + // (like permissions) was last changed. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the lower 32 bits of the attribute + // value’s reference count. + ChangeTime() time.Time + + // ModificationTime returns the last modification time. Shows when the file + // content was last modified. + // + // If InExtendedAttr is set, then this should NOT be used because + // the underlying field contains the number of the inode that owns the + // extended attribute. + ModificationTime() time.Time + + // DeletionTime returns the deletion time. Inodes are marked as deleted by + // writing to the underlying field. FS tools can restore files until they are + // actually overwritten. + DeletionTime() time.Time + + // LinksCount returns the number of hard links to this inode. + // + // Normally there is an upper limit on the number of hard links: + // - ext2/ext3 = 32,000 + // - ext4 = 65,000 + // + // This implies that an ext4 directory cannot have more than 64,998 + // subdirectories because each subdirectory will have a hard link to the + // directory via the `..` entry. The directory has hard link via the `.` entry + // of its own. And finally the inode is initiated with 1 hard link (itself). + // + // The underlying value is reset to 1 if all the following hold: + // - Inode is a directory. + // - SbDirNlink is enabled. + // - Number of hard links is incremented past 64,999. + // Hard link value of 1 for a directory would indicate that the number of hard + // links is unknown because a directory can have minimum 2 hard links (itself + // and `.` entry). + LinksCount() uint16 + + // Flags returns InodeFlags which represents the inode flags. + Flags() InodeFlags + + // Blocks returns the underlying inode.i_block array. This field is special + // and is used to store various kinds of things depending on the filesystem + // version and inode type. + // - In ext2/ext3, it contains the block map. + // - In ext4, it contains the extent tree. + // - For inline files, it contains the file contents. + // - For symlinks, it contains the link path (if it fits here). + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. + Blocks() [60]byte +} + +// Inode flags. This is not comprehensive and flags which were not used in +// the Linux kernel have been excluded. +const ( + // InSync indicates that all writes to the file must be synchronous. + InSync = 0x8 + + // InImmutable indicates that this file is immutable. + InImmutable = 0x10 + + // InAppend indicates that this file can only be appended to. + InAppend = 0x20 + + // InNoDump indicates that teh dump(1) utility should not dump this file. + InNoDump = 0x40 + + // InNoAccessTime indicates that the access time of this inode must not be + // updated. + InNoAccessTime = 0x80 + + // InIndex indicates that this directory has hashed indexes. + InIndex = 0x1000 + + // InJournalData indicates that file data must always be written through a + // journal device. + InJournalData = 0x4000 + + // InDirSync indicates that all the directory entiry data must be written + // synchronously. + InDirSync = 0x10000 + + // InTopDir indicates that this inode is at the top of the directory hierarchy. + InTopDir = 0x20000 + + // InHugeFile indicates that this is a huge file. + InHugeFile = 0x40000 + + // InExtents indicates that this inode uses extents. + InExtents = 0x80000 + + // InExtendedAttr indicates that this inode stores a large extended attribute + // value in its data blocks. + InExtendedAttr = 0x200000 + + // InInline indicates that this inode has inline data. + InInline = 0x10000000 + + // InReserved indicates that this inode is reserved for the ext4 library. + InReserved = 0x80000000 +) + +// InodeFlags represents all possible combinations of inode flags. It aims to +// cover the bit masks and provide a more user-friendly interface. +type InodeFlags struct { + Sync bool + Immutable bool + Append bool + NoDump bool + NoAccessTime bool + Index bool + JournalData bool + DirSync bool + TopDir bool + HugeFile bool + Extents bool + ExtendedAttr bool + Inline bool + Reserved bool +} + +// ToInt converts inode flags back to its 32-bit rep. +func (f InodeFlags) ToInt() uint32 { + var res uint32 + + if f.Sync { + res |= InSync + } + if f.Immutable { + res |= InImmutable + } + if f.Append { + res |= InAppend + } + if f.NoDump { + res |= InNoDump + } + if f.NoAccessTime { + res |= InNoAccessTime + } + if f.Index { + res |= InIndex + } + if f.JournalData { + res |= InJournalData + } + if f.DirSync { + res |= InDirSync + } + if f.TopDir { + res |= InTopDir + } + if f.HugeFile { + res |= InHugeFile + } + if f.Extents { + res |= InExtents + } + if f.ExtendedAttr { + res |= InExtendedAttr + } + if f.Inline { + res |= InInline + } + if f.Reserved { + res |= InReserved + } + + return res +} + +// InodeFlagsFromInt converts the integer representation of inode flags to +// a InodeFlags struct. +func InodeFlagsFromInt(f uint32) InodeFlags { + return InodeFlags{ + Sync: f&InSync > 0, + Immutable: f&InImmutable > 0, + Append: f&InAppend > 0, + NoDump: f&InNoDump > 0, + NoAccessTime: f&InNoAccessTime > 0, + Index: f&InIndex > 0, + JournalData: f&InJournalData > 0, + DirSync: f&InDirSync > 0, + TopDir: f&InTopDir > 0, + HugeFile: f&InHugeFile > 0, + Extents: f&InExtents > 0, + ExtendedAttr: f&InExtendedAttr > 0, + Inline: f&InInline > 0, + Reserved: f&InReserved > 0, + } +} + +// These masks define how users can view/modify inode flags. The rest of the +// flags are for internal kernel usage only. +const ( + InUserReadFlagMask = 0x4BDFFF + InUserWriteFlagMask = 0x4B80FF +) diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext4/disklayout/superblock.go index d630ba8a6..030c73410 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock.go +++ b/pkg/sentry/fs/ext4/disklayout/superblock.go @@ -12,9 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package disklayout provides Linux ext file system's disk level structures +// which can be directly read into from the underlying device. All structures +// on disk are in little-endian order. Only jbd2 (journal) structures are in +// big-endian order. Structs aim to emulate structures `exactly` how they are +// layed out on disk. +// +// This library aims to be compatible with all ext(2/3/4) systems so it +// provides a generic interface for all major structures and various +// implementations (for different versions). The user code is responsible for +// using appropriate implementations based on the underlying device. +// +// Notes: +// - All fields in these structs are exported because binary.Read would +// panic otherwise. +// - All OS dependent fields in these structures will be interpretted using +// the Linux version of that field. package disklayout -// SuperBlock should be implemented by structs representing ext4 superblock. +// SuperBlock should be implemented by structs representing the ext superblock. // The superblock holds a lot of information about the enclosing filesystem. // This interface aims to provide access methods to important information held // by the superblock. It does NOT expose all fields of the superblock, only the @@ -23,11 +39,11 @@ package disklayout // Location and replication: // - The superblock is located at offset 1024 in block group 0. // - Redundant copies of the superblock and group descriptors are kept in -// all groups if sparse_super feature flag is NOT set. If it is set, the +// all groups if SbSparse feature flag is NOT set. If it is set, the // replicas only exist in groups whose group number is either 0 or a // power of 3, 5, or 7. // - There is also a sparse superblock feature v2 in which there are just -// two replicas saved in block groups pointed by the s_backup_bgs field. +// two replicas saved in the block groups pointed by sb.s_backup_bgs. // // Replicas should eventually be updated if the superblock is updated. // diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md index e1630553b..71a577d9d 100644 --- a/pkg/sentry/fs/g3doc/inotify.md +++ b/pkg/sentry/fs/g3doc/inotify.md @@ -9,7 +9,7 @@ For the most part, the sentry implementation of inotify mirrors the Linux architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are backed by a pseudo-filesystem. Events are generated from various places in the sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and -the [process fd table][fd_map]. Watches are stored in inodes and generated +the [process fd table][fd_table]. Watches are stored in inodes and generated events are queued to the inotify instance owning the watches for delivery to the user. @@ -114,7 +114,7 @@ ordering. [dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go [event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go -[fd_map]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_map.go +[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go [inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go [inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go [inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index da41a10ab..70ed854a8 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index ea7aded9a..bee421d76 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -42,8 +41,8 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF var file *fs.File var fdFlags kernel.FDFlags t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - file, fdFlags = fdm.GetDescriptor(kdefs.FD(n)) + if fdTable := t.FDTable(); fdTable != nil { + file, fdFlags = fdTable.Get(int32(n)) } }) if file == nil { @@ -56,36 +55,31 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { - var fds kernel.FDs + var fds []int32 t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.GetFDs() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() } }) - fdInts := make([]int, 0, len(fds)) - for _, fd := range fds { - fdInts = append(fdInts, int(fd)) - } - - // Find the fd to start at. - idx := sort.SearchInts(fdInts, int(offset)) - if idx == len(fdInts) { + // Find the appropriate starting point. + idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) }) + if idx == len(fds) { return offset, nil } - fdInts = fdInts[idx:] + fds = fds[idx:] - var fd int - for _, fd = range fdInts { + // Serialize all FDs. + for _, fd := range fds { name := strconv.FormatUint(uint64(fd), 10) - if err := c.DirEmit(name, toDentAttr(fd)); err != nil { + if err := c.DirEmit(name, toDentAttr(int(fd))); err != nil { // Returned offset is the next fd to serialize. return int64(fd), err } } // We serialized them all. Next offset should be higher than last // serialized fd. - return int64(fd + 1), nil + return int64(fds[len(fds)-1] + 1), nil } // fd implements fs.InodeOperations for a file in /proc/TID/fd/. @@ -154,9 +148,9 @@ func (f *fd) Close() error { type fdDir struct { ramfs.Dir - // We hold a reference on the task's fdmap but only keep an indirect - // task pointer to avoid Dirent loading circularity caused by fdmap's - // potential back pointers into the dirent tree. + // We hold a reference on the task's FDTable but only keep an indirect + // task pointer to avoid Dirent loading circularity caused by the + // table's back pointers into the dirent tree. t *kernel.Task } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index b2e36aeee..ef0ca3301 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -580,8 +580,8 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( var fds int var vss, rss, data uint64 s.t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.Size() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() |