summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs')
-rw-r--r--pkg/sentry/fs/README.md2
-rw-r--r--pkg/sentry/fs/ext4/disklayout/BUILD8
-rw-r--r--pkg/sentry/fs/ext4/disklayout/block_group.go22
-rw-r--r--pkg/sentry/fs/ext4/disklayout/block_group_32.go4
-rw-r--r--pkg/sentry/fs/ext4/disklayout/inode.go267
-rw-r--r--pkg/sentry/fs/ext4/disklayout/superblock.go22
-rw-r--r--pkg/sentry/fs/g3doc/inotify.md4
-rw-r--r--pkg/sentry/fs/proc/BUILD1
-rw-r--r--pkg/sentry/fs/proc/fds.go38
-rw-r--r--pkg/sentry/fs/proc/task.go4
10 files changed, 323 insertions, 49 deletions
diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md
index c4e8faa3c..db4a1b730 100644
--- a/pkg/sentry/fs/README.md
+++ b/pkg/sentry/fs/README.md
@@ -69,7 +69,7 @@ Specifically this state is:
- An `fs.MountManager` containing mount points.
-- A `kernel.FDMap` containing pointers to open files.
+- A `kernel.FDTable` containing pointers to open files.
Anything else managed by the VFS that can be easily loaded into memory from a
filesystem is synced back to those filesystems and is not saved. Examples are
diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD
index cdac63655..e3d2f8d71 100644
--- a/pkg/sentry/fs/ext4/disklayout/BUILD
+++ b/pkg/sentry/fs/ext4/disklayout/BUILD
@@ -8,6 +8,7 @@ go_library(
"block_group.go",
"block_group_32.go",
"block_group_64.go",
+ "inode.go",
"superblock.go",
"superblock_32.go",
"superblock_64.go",
@@ -15,7 +16,12 @@ go_library(
"test_utils.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout",
- deps = ["//pkg/binary"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
+ ],
)
go_test(
diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext4/disklayout/block_group.go
index 7df76a036..32ea3d97d 100644
--- a/pkg/sentry/fs/ext4/disklayout/block_group.go
+++ b/pkg/sentry/fs/ext4/disklayout/block_group.go
@@ -12,26 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package disklayout provides ext4 disk level structures which can be directly
-// filled with bytes from the underlying device. All structures on disk are in
-// little-endian order. Only jbd2 (journal) structures are in big-endian order.
-// Structs aim to emulate structures `exactly` how they are layed out on disk.
-//
-// Note: All fields in these structs are exported because binary.Read would
-// panic otherwise.
package disklayout
-// BlockGroup represents Linux struct ext4_group_desc which is internally
-// called a block group descriptor. An ext4 file system is split into a series
-// of block groups. This provides an access layer to information needed to
-// access and use a block group.
+// BlockGroup represents a Linux ext block group descriptor. An ext file system
+// is split into a series of block groups. This provides an access layer to
+// information needed to access and use a block group.
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors.
type BlockGroup interface {
// InodeTable returns the absolute block number of the block containing the
// inode table. This points to an array of Inode structs. Inode tables are
// statically allocated at mkfs time. The superblock records the number of
- // inodes per group (length of this table).
+ // inodes per group (length of this table) and the size of each inode struct.
InodeTable() uint64
// BlockBitmap returns the absolute block number of the block containing the
@@ -73,15 +65,15 @@ type BlockGroup interface {
// Checksum returns this block group's checksum.
//
- // If RO_COMPAT_METADATA_CSUM feature is set:
+ // If SbMetadataCsum feature is set:
// - checksum is crc32c(FS UUID + group number + group descriptor
// structure) & 0xFFFF.
//
- // If RO_COMPAT_GDT_CSUM feature is set:
+ // If SbGdtCsum feature is set:
// - checksum is crc16(FS UUID + group number + group descriptor
// structure).
//
- // RO_COMPAT_METADATA_CSUM and RO_COMPAT_GDT_CSUM should not be both set.
+ // SbMetadataCsum and SbGdtCsum should not be both set.
// If they are, Linux warns and asks to run fsck.
Checksum() uint16
diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext4/disklayout/block_group_32.go
index 087f1fb4a..dadecb3e3 100644
--- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go
+++ b/pkg/sentry/fs/ext4/disklayout/block_group_32.go
@@ -15,8 +15,8 @@
package disklayout
// BlockGroup32Bit emulates the first half of struct ext4_group_desc in
-// fs/ext4/ext4.h. It is the block group descriptor struct for 32-bit ext4
-// filesystems. It implements BlockGroup interface.
+// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and
+// 32-bit ext4 filesystems. It implements BlockGroup interface.
//
// The suffix `Lo` here stands for lower bits because this is also used in the
// 64-bit version where these fields represent the lower half of the fields.
diff --git a/pkg/sentry/fs/ext4/disklayout/inode.go b/pkg/sentry/fs/ext4/disklayout/inode.go
new file mode 100644
index 000000000..b48001910
--- /dev/null
+++ b/pkg/sentry/fs/ext4/disklayout/inode.go
@@ -0,0 +1,267 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+)
+
+// The Inode interface must be implemented by structs representing ext inodes.
+// The inode stores all the metadata pertaining to the file (except for the
+// file name which is held by the directory entry). It does NOT expose all
+// fields and should be extended if need be.
+//
+// Some file systems (e.g. FAT) use the directory entry to store all this
+// information. Ext file systems do not so that they can support hard links.
+// However, ext4 cheats a little bit and duplicates the file type in the
+// directory entry for performance gains.
+//
+// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes.
+type Inode interface {
+ // Mode returns the linux file mode which is majorly used to extract
+ // information like:
+ // - File permissions (read/write/execute by user/group/others).
+ // - Sticky, set UID and GID bits.
+ // - File type.
+ //
+ // Masks to extract this information are provided in pkg/abi/linux/file.go.
+ Mode() linux.FileMode
+
+ // UID returns the owner UID.
+ UID() auth.KUID
+
+ // GID returns the owner GID.
+ GID() auth.KGID
+
+ // Size returns the size of the file in bytes.
+ Size() uint64
+
+ // InodeSize returns the size of this inode struct in bytes.
+ // In ext2 and ext3, the inode struct and inode disk record size was fixed at
+ // 128 bytes. Ext4 makes it possible for the inode struct to be bigger.
+ // However, accessing any field beyond the 128 bytes marker must be verified
+ // using this method.
+ InodeSize() uint16
+
+ // AccessTime returns the last access time. Shows when the file was last read.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because the
+ // underlying field is used to store the extended attribute value checksum.
+ AccessTime() time.Time
+
+ // ChangeTime returns the last change time. Shows when the file meta data
+ // (like permissions) was last changed.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because the
+ // underlying field is used to store the lower 32 bits of the attribute
+ // value’s reference count.
+ ChangeTime() time.Time
+
+ // ModificationTime returns the last modification time. Shows when the file
+ // content was last modified.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because
+ // the underlying field contains the number of the inode that owns the
+ // extended attribute.
+ ModificationTime() time.Time
+
+ // DeletionTime returns the deletion time. Inodes are marked as deleted by
+ // writing to the underlying field. FS tools can restore files until they are
+ // actually overwritten.
+ DeletionTime() time.Time
+
+ // LinksCount returns the number of hard links to this inode.
+ //
+ // Normally there is an upper limit on the number of hard links:
+ // - ext2/ext3 = 32,000
+ // - ext4 = 65,000
+ //
+ // This implies that an ext4 directory cannot have more than 64,998
+ // subdirectories because each subdirectory will have a hard link to the
+ // directory via the `..` entry. The directory has hard link via the `.` entry
+ // of its own. And finally the inode is initiated with 1 hard link (itself).
+ //
+ // The underlying value is reset to 1 if all the following hold:
+ // - Inode is a directory.
+ // - SbDirNlink is enabled.
+ // - Number of hard links is incremented past 64,999.
+ // Hard link value of 1 for a directory would indicate that the number of hard
+ // links is unknown because a directory can have minimum 2 hard links (itself
+ // and `.` entry).
+ LinksCount() uint16
+
+ // Flags returns InodeFlags which represents the inode flags.
+ Flags() InodeFlags
+
+ // Blocks returns the underlying inode.i_block array. This field is special
+ // and is used to store various kinds of things depending on the filesystem
+ // version and inode type.
+ // - In ext2/ext3, it contains the block map.
+ // - In ext4, it contains the extent tree.
+ // - For inline files, it contains the file contents.
+ // - For symlinks, it contains the link path (if it fits here).
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block.
+ Blocks() [60]byte
+}
+
+// Inode flags. This is not comprehensive and flags which were not used in
+// the Linux kernel have been excluded.
+const (
+ // InSync indicates that all writes to the file must be synchronous.
+ InSync = 0x8
+
+ // InImmutable indicates that this file is immutable.
+ InImmutable = 0x10
+
+ // InAppend indicates that this file can only be appended to.
+ InAppend = 0x20
+
+ // InNoDump indicates that teh dump(1) utility should not dump this file.
+ InNoDump = 0x40
+
+ // InNoAccessTime indicates that the access time of this inode must not be
+ // updated.
+ InNoAccessTime = 0x80
+
+ // InIndex indicates that this directory has hashed indexes.
+ InIndex = 0x1000
+
+ // InJournalData indicates that file data must always be written through a
+ // journal device.
+ InJournalData = 0x4000
+
+ // InDirSync indicates that all the directory entiry data must be written
+ // synchronously.
+ InDirSync = 0x10000
+
+ // InTopDir indicates that this inode is at the top of the directory hierarchy.
+ InTopDir = 0x20000
+
+ // InHugeFile indicates that this is a huge file.
+ InHugeFile = 0x40000
+
+ // InExtents indicates that this inode uses extents.
+ InExtents = 0x80000
+
+ // InExtendedAttr indicates that this inode stores a large extended attribute
+ // value in its data blocks.
+ InExtendedAttr = 0x200000
+
+ // InInline indicates that this inode has inline data.
+ InInline = 0x10000000
+
+ // InReserved indicates that this inode is reserved for the ext4 library.
+ InReserved = 0x80000000
+)
+
+// InodeFlags represents all possible combinations of inode flags. It aims to
+// cover the bit masks and provide a more user-friendly interface.
+type InodeFlags struct {
+ Sync bool
+ Immutable bool
+ Append bool
+ NoDump bool
+ NoAccessTime bool
+ Index bool
+ JournalData bool
+ DirSync bool
+ TopDir bool
+ HugeFile bool
+ Extents bool
+ ExtendedAttr bool
+ Inline bool
+ Reserved bool
+}
+
+// ToInt converts inode flags back to its 32-bit rep.
+func (f InodeFlags) ToInt() uint32 {
+ var res uint32
+
+ if f.Sync {
+ res |= InSync
+ }
+ if f.Immutable {
+ res |= InImmutable
+ }
+ if f.Append {
+ res |= InAppend
+ }
+ if f.NoDump {
+ res |= InNoDump
+ }
+ if f.NoAccessTime {
+ res |= InNoAccessTime
+ }
+ if f.Index {
+ res |= InIndex
+ }
+ if f.JournalData {
+ res |= InJournalData
+ }
+ if f.DirSync {
+ res |= InDirSync
+ }
+ if f.TopDir {
+ res |= InTopDir
+ }
+ if f.HugeFile {
+ res |= InHugeFile
+ }
+ if f.Extents {
+ res |= InExtents
+ }
+ if f.ExtendedAttr {
+ res |= InExtendedAttr
+ }
+ if f.Inline {
+ res |= InInline
+ }
+ if f.Reserved {
+ res |= InReserved
+ }
+
+ return res
+}
+
+// InodeFlagsFromInt converts the integer representation of inode flags to
+// a InodeFlags struct.
+func InodeFlagsFromInt(f uint32) InodeFlags {
+ return InodeFlags{
+ Sync: f&InSync > 0,
+ Immutable: f&InImmutable > 0,
+ Append: f&InAppend > 0,
+ NoDump: f&InNoDump > 0,
+ NoAccessTime: f&InNoAccessTime > 0,
+ Index: f&InIndex > 0,
+ JournalData: f&InJournalData > 0,
+ DirSync: f&InDirSync > 0,
+ TopDir: f&InTopDir > 0,
+ HugeFile: f&InHugeFile > 0,
+ Extents: f&InExtents > 0,
+ ExtendedAttr: f&InExtendedAttr > 0,
+ Inline: f&InInline > 0,
+ Reserved: f&InReserved > 0,
+ }
+}
+
+// These masks define how users can view/modify inode flags. The rest of the
+// flags are for internal kernel usage only.
+const (
+ InUserReadFlagMask = 0x4BDFFF
+ InUserWriteFlagMask = 0x4B80FF
+)
diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext4/disklayout/superblock.go
index d630ba8a6..030c73410 100644
--- a/pkg/sentry/fs/ext4/disklayout/superblock.go
+++ b/pkg/sentry/fs/ext4/disklayout/superblock.go
@@ -12,9 +12,25 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// Package disklayout provides Linux ext file system's disk level structures
+// which can be directly read into from the underlying device. All structures
+// on disk are in little-endian order. Only jbd2 (journal) structures are in
+// big-endian order. Structs aim to emulate structures `exactly` how they are
+// layed out on disk.
+//
+// This library aims to be compatible with all ext(2/3/4) systems so it
+// provides a generic interface for all major structures and various
+// implementations (for different versions). The user code is responsible for
+// using appropriate implementations based on the underlying device.
+//
+// Notes:
+// - All fields in these structs are exported because binary.Read would
+// panic otherwise.
+// - All OS dependent fields in these structures will be interpretted using
+// the Linux version of that field.
package disklayout
-// SuperBlock should be implemented by structs representing ext4 superblock.
+// SuperBlock should be implemented by structs representing the ext superblock.
// The superblock holds a lot of information about the enclosing filesystem.
// This interface aims to provide access methods to important information held
// by the superblock. It does NOT expose all fields of the superblock, only the
@@ -23,11 +39,11 @@ package disklayout
// Location and replication:
// - The superblock is located at offset 1024 in block group 0.
// - Redundant copies of the superblock and group descriptors are kept in
-// all groups if sparse_super feature flag is NOT set. If it is set, the
+// all groups if SbSparse feature flag is NOT set. If it is set, the
// replicas only exist in groups whose group number is either 0 or a
// power of 3, 5, or 7.
// - There is also a sparse superblock feature v2 in which there are just
-// two replicas saved in block groups pointed by the s_backup_bgs field.
+// two replicas saved in the block groups pointed by sb.s_backup_bgs.
//
// Replicas should eventually be updated if the superblock is updated.
//
diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md
index e1630553b..71a577d9d 100644
--- a/pkg/sentry/fs/g3doc/inotify.md
+++ b/pkg/sentry/fs/g3doc/inotify.md
@@ -9,7 +9,7 @@ For the most part, the sentry implementation of inotify mirrors the Linux
architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are
backed by a pseudo-filesystem. Events are generated from various places in the
sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and
-the [process fd table][fd_map]. Watches are stored in inodes and generated
+the [process fd table][fd_table]. Watches are stored in inodes and generated
events are queued to the inotify instance owning the watches for delivery to the
user.
@@ -114,7 +114,7 @@ ordering.
[dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go
[event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go
-[fd_map]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_map.go
+[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go
[inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go
[inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go
[inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index da41a10ab..70ed854a8 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -42,7 +42,6 @@ go_library(
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/mm",
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go
index ea7aded9a..bee421d76 100644
--- a/pkg/sentry/fs/proc/fds.go
+++ b/pkg/sentry/fs/proc/fds.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -42,8 +41,8 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
var file *fs.File
var fdFlags kernel.FDFlags
t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- file, fdFlags = fdm.GetDescriptor(kdefs.FD(n))
+ if fdTable := t.FDTable(); fdTable != nil {
+ file, fdFlags = fdTable.Get(int32(n))
}
})
if file == nil {
@@ -56,36 +55,31 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
// toDentAttr callback for each to get a DentAttr, which it then emits. This is
// a helper for implementing fs.InodeOperations.Readdir.
func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) {
- var fds kernel.FDs
+ var fds []int32
t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- fds = fdm.GetFDs()
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.GetFDs()
}
})
- fdInts := make([]int, 0, len(fds))
- for _, fd := range fds {
- fdInts = append(fdInts, int(fd))
- }
-
- // Find the fd to start at.
- idx := sort.SearchInts(fdInts, int(offset))
- if idx == len(fdInts) {
+ // Find the appropriate starting point.
+ idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) })
+ if idx == len(fds) {
return offset, nil
}
- fdInts = fdInts[idx:]
+ fds = fds[idx:]
- var fd int
- for _, fd = range fdInts {
+ // Serialize all FDs.
+ for _, fd := range fds {
name := strconv.FormatUint(uint64(fd), 10)
- if err := c.DirEmit(name, toDentAttr(fd)); err != nil {
+ if err := c.DirEmit(name, toDentAttr(int(fd))); err != nil {
// Returned offset is the next fd to serialize.
return int64(fd), err
}
}
// We serialized them all. Next offset should be higher than last
// serialized fd.
- return int64(fd + 1), nil
+ return int64(fds[len(fds)-1] + 1), nil
}
// fd implements fs.InodeOperations for a file in /proc/TID/fd/.
@@ -154,9 +148,9 @@ func (f *fd) Close() error {
type fdDir struct {
ramfs.Dir
- // We hold a reference on the task's fdmap but only keep an indirect
- // task pointer to avoid Dirent loading circularity caused by fdmap's
- // potential back pointers into the dirent tree.
+ // We hold a reference on the task's FDTable but only keep an indirect
+ // task pointer to avoid Dirent loading circularity caused by the
+ // table's back pointers into the dirent tree.
t *kernel.Task
}
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index b2e36aeee..ef0ca3301 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -580,8 +580,8 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
var fds int
var vss, rss, data uint64
s.t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- fds = fdm.Size()
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.Size()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()