summaryrefslogtreecommitdiffhomepage
path: root/pkg
diff options
context:
space:
mode:
Diffstat (limited to 'pkg')
-rw-r--r--pkg/refs/refcounter.go7
-rw-r--r--pkg/sentry/control/BUILD1
-rw-r--r--pkg/sentry/control/proc.go12
-rw-r--r--pkg/sentry/fs/README.md2
-rw-r--r--pkg/sentry/fs/ext4/disklayout/BUILD8
-rw-r--r--pkg/sentry/fs/ext4/disklayout/block_group.go22
-rw-r--r--pkg/sentry/fs/ext4/disklayout/block_group_32.go4
-rw-r--r--pkg/sentry/fs/ext4/disklayout/inode.go267
-rw-r--r--pkg/sentry/fs/ext4/disklayout/superblock.go22
-rw-r--r--pkg/sentry/fs/g3doc/inotify.md4
-rw-r--r--pkg/sentry/fs/proc/BUILD1
-rw-r--r--pkg/sentry/fs/proc/fds.go38
-rw-r--r--pkg/sentry/fs/proc/task.go4
-rw-r--r--pkg/sentry/kernel/BUILD9
-rw-r--r--pkg/sentry/kernel/epoll/BUILD1
-rw-r--r--pkg/sentry/kernel/epoll/epoll.go3
-rw-r--r--pkg/sentry/kernel/fd_map.go366
-rw-r--r--pkg/sentry/kernel/fd_map_test.go136
-rw-r--r--pkg/sentry/kernel/fd_table.go380
-rw-r--r--pkg/sentry/kernel/fd_table_test.go192
-rw-r--r--pkg/sentry/kernel/fd_table_unsafe.go103
-rw-r--r--pkg/sentry/kernel/kdefs/BUILD10
-rw-r--r--pkg/sentry/kernel/kdefs/kdefs.go20
-rw-r--r--pkg/sentry/kernel/kernel.go76
-rw-r--r--pkg/sentry/kernel/task.go65
-rw-r--r--pkg/sentry/kernel/task_clone.go40
-rw-r--r--pkg/sentry/kernel/task_exec.go2
-rw-r--r--pkg/sentry/kernel/task_exit.go4
-rw-r--r--pkg/sentry/kernel/task_log.go2
-rw-r--r--pkg/sentry/kernel/task_start.go13
-rw-r--r--pkg/sentry/loader/loader.go2
-rw-r--r--pkg/sentry/platform/kvm/machine.go7
-rw-r--r--pkg/sentry/socket/BUILD1
-rw-r--r--pkg/sentry/socket/control/BUILD1
-rw-r--r--pkg/sentry/socket/control/control.go10
-rw-r--r--pkg/sentry/socket/epsocket/BUILD1
-rw-r--r--pkg/sentry/socket/epsocket/epsocket.go8
-rw-r--r--pkg/sentry/socket/hostinet/BUILD1
-rw-r--r--pkg/sentry/socket/hostinet/socket.go9
-rw-r--r--pkg/sentry/socket/netlink/BUILD1
-rw-r--r--pkg/sentry/socket/netlink/socket.go3
-rw-r--r--pkg/sentry/socket/rpcinet/BUILD1
-rw-r--r--pkg/sentry/socket/rpcinet/socket.go8
-rw-r--r--pkg/sentry/socket/socket.go3
-rw-r--r--pkg/sentry/socket/unix/BUILD1
-rw-r--r--pkg/sentry/socket/unix/unix.go8
-rw-r--r--pkg/sentry/strace/BUILD1
-rw-r--r--pkg/sentry/strace/poll.go3
-rw-r--r--pkg/sentry/strace/strace.go7
-rw-r--r--pkg/sentry/syscalls/BUILD1
-rw-r--r--pkg/sentry/syscalls/epoll.go30
-rw-r--r--pkg/sentry/syscalls/linux/BUILD1
-rw-r--r--pkg/sentry/syscalls/linux/linux64.go516
-rw-r--r--pkg/sentry/syscalls/linux/sys_aio.go9
-rw-r--r--pkg/sentry/syscalls/linux/sys_epoll.go7
-rw-r--r--pkg/sentry/syscalls/linux/sys_eventfd.go5
-rw-r--r--pkg/sentry/syscalls/linux/sys_file.go187
-rw-r--r--pkg/sentry/syscalls/linux/sys_getdents.go9
-rw-r--r--pkg/sentry/syscalls/linux/sys_inotify.go15
-rw-r--r--pkg/sentry/syscalls/linux/sys_lseek.go5
-rw-r--r--pkg/sentry/syscalls/linux/sys_mmap.go5
-rw-r--r--pkg/sentry/syscalls/linux/sys_pipe.go23
-rw-r--r--pkg/sentry/syscalls/linux/sys_poll.go5
-rw-r--r--pkg/sentry/syscalls/linux/sys_prctl.go5
-rw-r--r--pkg/sentry/syscalls/linux/sys_read.go21
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go90
-rw-r--r--pkg/sentry/syscalls/linux/sys_splice.go25
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat.go17
-rw-r--r--pkg/sentry/syscalls/linux/sys_sync.go17
-rw-r--r--pkg/sentry/syscalls/linux/sys_timerfd.go13
-rw-r--r--pkg/sentry/syscalls/linux/sys_write.go21
-rw-r--r--pkg/sentry/syscalls/syscalls.go17
-rw-r--r--pkg/tcpip/iptables/BUILD18
-rw-r--r--pkg/tcpip/iptables/iptables.go81
-rw-r--r--pkg/tcpip/iptables/targets.go35
-rw-r--r--pkg/tcpip/iptables/types.go183
76 files changed, 1988 insertions, 1263 deletions
diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go
index 3dd6f0dd4..6ecbace9e 100644
--- a/pkg/refs/refcounter.go
+++ b/pkg/refs/refcounter.go
@@ -365,6 +365,8 @@ func (r *AtomicRefCount) ReadRefs() int64 {
//
// The sanity check here is limited to real references, since if they have
// dropped beneath zero then the object should have been destroyed.
+//
+//go:nosplit
func (r *AtomicRefCount) IncRef() {
if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
panic("Incrementing non-positive ref count")
@@ -379,6 +381,8 @@ func (r *AtomicRefCount) IncRef() {
// To do this safely without a loop, a speculative reference is first acquired
// on the object. This allows multiple concurrent TryIncRef calls to
// distinguish other TryIncRef calls from genuine references held.
+//
+//go:nosplit
func (r *AtomicRefCount) TryIncRef() bool {
const speculativeRef = 1 << 32
v := atomic.AddInt64(&r.refCount, speculativeRef)
@@ -420,6 +424,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) {
// B: DecRef [real decrease]
// A: TryIncRef [transform speculative to real]
//
+//go:nosplit
func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) {
switch v := atomic.AddInt64(&r.refCount, -1); {
case v < -1:
@@ -455,6 +460,8 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) {
}
// DecRef decrements this object's reference count.
+//
+//go:nosplit
func (r *AtomicRefCount) DecRef() {
r.DecRefWithDestructor(nil)
}
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 5dccb8e3c..bf802d1b6 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -23,7 +23,6 @@ go_library(
"//pkg/sentry/fs/host",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/state",
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 66a506584..6ae60c5cb 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -28,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -123,9 +122,8 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID
// TTYFileOperations that wraps the TTY is also returned.
func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) {
// Import file descriptors.
- l := limits.NewLimitSet()
- fdm := proc.Kernel.NewFDMap()
- defer fdm.DecRef()
+ fdTable := proc.Kernel.NewFDTable()
+ defer fdTable.DecRef()
// No matter what happens, we should close all files in the FilePayload
// before returning. Any files that are imported will be duped.
@@ -149,9 +147,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
WorkingDirectory: args.WorkingDirectory,
Root: args.Root,
Credentials: creds,
- FDMap: fdm,
+ FDTable: fdTable,
Umask: 0022,
- Limits: l,
+ Limits: limits.NewLimitSet(),
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
UTSNamespace: proc.Kernel.RootUTSNamespace(),
IPCNamespace: proc.Kernel.RootIPCNamespace(),
@@ -212,7 +210,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
}
// Add the file to the FD map.
- if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil {
+ if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
return nil, 0, nil, err
}
}
diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md
index c4e8faa3c..db4a1b730 100644
--- a/pkg/sentry/fs/README.md
+++ b/pkg/sentry/fs/README.md
@@ -69,7 +69,7 @@ Specifically this state is:
- An `fs.MountManager` containing mount points.
-- A `kernel.FDMap` containing pointers to open files.
+- A `kernel.FDTable` containing pointers to open files.
Anything else managed by the VFS that can be easily loaded into memory from a
filesystem is synced back to those filesystems and is not saved. Examples are
diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD
index cdac63655..e3d2f8d71 100644
--- a/pkg/sentry/fs/ext4/disklayout/BUILD
+++ b/pkg/sentry/fs/ext4/disklayout/BUILD
@@ -8,6 +8,7 @@ go_library(
"block_group.go",
"block_group_32.go",
"block_group_64.go",
+ "inode.go",
"superblock.go",
"superblock_32.go",
"superblock_64.go",
@@ -15,7 +16,12 @@ go_library(
"test_utils.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout",
- deps = ["//pkg/binary"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
+ ],
)
go_test(
diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext4/disklayout/block_group.go
index 7df76a036..32ea3d97d 100644
--- a/pkg/sentry/fs/ext4/disklayout/block_group.go
+++ b/pkg/sentry/fs/ext4/disklayout/block_group.go
@@ -12,26 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package disklayout provides ext4 disk level structures which can be directly
-// filled with bytes from the underlying device. All structures on disk are in
-// little-endian order. Only jbd2 (journal) structures are in big-endian order.
-// Structs aim to emulate structures `exactly` how they are layed out on disk.
-//
-// Note: All fields in these structs are exported because binary.Read would
-// panic otherwise.
package disklayout
-// BlockGroup represents Linux struct ext4_group_desc which is internally
-// called a block group descriptor. An ext4 file system is split into a series
-// of block groups. This provides an access layer to information needed to
-// access and use a block group.
+// BlockGroup represents a Linux ext block group descriptor. An ext file system
+// is split into a series of block groups. This provides an access layer to
+// information needed to access and use a block group.
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors.
type BlockGroup interface {
// InodeTable returns the absolute block number of the block containing the
// inode table. This points to an array of Inode structs. Inode tables are
// statically allocated at mkfs time. The superblock records the number of
- // inodes per group (length of this table).
+ // inodes per group (length of this table) and the size of each inode struct.
InodeTable() uint64
// BlockBitmap returns the absolute block number of the block containing the
@@ -73,15 +65,15 @@ type BlockGroup interface {
// Checksum returns this block group's checksum.
//
- // If RO_COMPAT_METADATA_CSUM feature is set:
+ // If SbMetadataCsum feature is set:
// - checksum is crc32c(FS UUID + group number + group descriptor
// structure) & 0xFFFF.
//
- // If RO_COMPAT_GDT_CSUM feature is set:
+ // If SbGdtCsum feature is set:
// - checksum is crc16(FS UUID + group number + group descriptor
// structure).
//
- // RO_COMPAT_METADATA_CSUM and RO_COMPAT_GDT_CSUM should not be both set.
+ // SbMetadataCsum and SbGdtCsum should not be both set.
// If they are, Linux warns and asks to run fsck.
Checksum() uint16
diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext4/disklayout/block_group_32.go
index 087f1fb4a..dadecb3e3 100644
--- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go
+++ b/pkg/sentry/fs/ext4/disklayout/block_group_32.go
@@ -15,8 +15,8 @@
package disklayout
// BlockGroup32Bit emulates the first half of struct ext4_group_desc in
-// fs/ext4/ext4.h. It is the block group descriptor struct for 32-bit ext4
-// filesystems. It implements BlockGroup interface.
+// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and
+// 32-bit ext4 filesystems. It implements BlockGroup interface.
//
// The suffix `Lo` here stands for lower bits because this is also used in the
// 64-bit version where these fields represent the lower half of the fields.
diff --git a/pkg/sentry/fs/ext4/disklayout/inode.go b/pkg/sentry/fs/ext4/disklayout/inode.go
new file mode 100644
index 000000000..b48001910
--- /dev/null
+++ b/pkg/sentry/fs/ext4/disklayout/inode.go
@@ -0,0 +1,267 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+)
+
+// The Inode interface must be implemented by structs representing ext inodes.
+// The inode stores all the metadata pertaining to the file (except for the
+// file name which is held by the directory entry). It does NOT expose all
+// fields and should be extended if need be.
+//
+// Some file systems (e.g. FAT) use the directory entry to store all this
+// information. Ext file systems do not so that they can support hard links.
+// However, ext4 cheats a little bit and duplicates the file type in the
+// directory entry for performance gains.
+//
+// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes.
+type Inode interface {
+ // Mode returns the linux file mode which is majorly used to extract
+ // information like:
+ // - File permissions (read/write/execute by user/group/others).
+ // - Sticky, set UID and GID bits.
+ // - File type.
+ //
+ // Masks to extract this information are provided in pkg/abi/linux/file.go.
+ Mode() linux.FileMode
+
+ // UID returns the owner UID.
+ UID() auth.KUID
+
+ // GID returns the owner GID.
+ GID() auth.KGID
+
+ // Size returns the size of the file in bytes.
+ Size() uint64
+
+ // InodeSize returns the size of this inode struct in bytes.
+ // In ext2 and ext3, the inode struct and inode disk record size was fixed at
+ // 128 bytes. Ext4 makes it possible for the inode struct to be bigger.
+ // However, accessing any field beyond the 128 bytes marker must be verified
+ // using this method.
+ InodeSize() uint16
+
+ // AccessTime returns the last access time. Shows when the file was last read.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because the
+ // underlying field is used to store the extended attribute value checksum.
+ AccessTime() time.Time
+
+ // ChangeTime returns the last change time. Shows when the file meta data
+ // (like permissions) was last changed.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because the
+ // underlying field is used to store the lower 32 bits of the attribute
+ // value’s reference count.
+ ChangeTime() time.Time
+
+ // ModificationTime returns the last modification time. Shows when the file
+ // content was last modified.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because
+ // the underlying field contains the number of the inode that owns the
+ // extended attribute.
+ ModificationTime() time.Time
+
+ // DeletionTime returns the deletion time. Inodes are marked as deleted by
+ // writing to the underlying field. FS tools can restore files until they are
+ // actually overwritten.
+ DeletionTime() time.Time
+
+ // LinksCount returns the number of hard links to this inode.
+ //
+ // Normally there is an upper limit on the number of hard links:
+ // - ext2/ext3 = 32,000
+ // - ext4 = 65,000
+ //
+ // This implies that an ext4 directory cannot have more than 64,998
+ // subdirectories because each subdirectory will have a hard link to the
+ // directory via the `..` entry. The directory has hard link via the `.` entry
+ // of its own. And finally the inode is initiated with 1 hard link (itself).
+ //
+ // The underlying value is reset to 1 if all the following hold:
+ // - Inode is a directory.
+ // - SbDirNlink is enabled.
+ // - Number of hard links is incremented past 64,999.
+ // Hard link value of 1 for a directory would indicate that the number of hard
+ // links is unknown because a directory can have minimum 2 hard links (itself
+ // and `.` entry).
+ LinksCount() uint16
+
+ // Flags returns InodeFlags which represents the inode flags.
+ Flags() InodeFlags
+
+ // Blocks returns the underlying inode.i_block array. This field is special
+ // and is used to store various kinds of things depending on the filesystem
+ // version and inode type.
+ // - In ext2/ext3, it contains the block map.
+ // - In ext4, it contains the extent tree.
+ // - For inline files, it contains the file contents.
+ // - For symlinks, it contains the link path (if it fits here).
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block.
+ Blocks() [60]byte
+}
+
+// Inode flags. This is not comprehensive and flags which were not used in
+// the Linux kernel have been excluded.
+const (
+ // InSync indicates that all writes to the file must be synchronous.
+ InSync = 0x8
+
+ // InImmutable indicates that this file is immutable.
+ InImmutable = 0x10
+
+ // InAppend indicates that this file can only be appended to.
+ InAppend = 0x20
+
+ // InNoDump indicates that teh dump(1) utility should not dump this file.
+ InNoDump = 0x40
+
+ // InNoAccessTime indicates that the access time of this inode must not be
+ // updated.
+ InNoAccessTime = 0x80
+
+ // InIndex indicates that this directory has hashed indexes.
+ InIndex = 0x1000
+
+ // InJournalData indicates that file data must always be written through a
+ // journal device.
+ InJournalData = 0x4000
+
+ // InDirSync indicates that all the directory entiry data must be written
+ // synchronously.
+ InDirSync = 0x10000
+
+ // InTopDir indicates that this inode is at the top of the directory hierarchy.
+ InTopDir = 0x20000
+
+ // InHugeFile indicates that this is a huge file.
+ InHugeFile = 0x40000
+
+ // InExtents indicates that this inode uses extents.
+ InExtents = 0x80000
+
+ // InExtendedAttr indicates that this inode stores a large extended attribute
+ // value in its data blocks.
+ InExtendedAttr = 0x200000
+
+ // InInline indicates that this inode has inline data.
+ InInline = 0x10000000
+
+ // InReserved indicates that this inode is reserved for the ext4 library.
+ InReserved = 0x80000000
+)
+
+// InodeFlags represents all possible combinations of inode flags. It aims to
+// cover the bit masks and provide a more user-friendly interface.
+type InodeFlags struct {
+ Sync bool
+ Immutable bool
+ Append bool
+ NoDump bool
+ NoAccessTime bool
+ Index bool
+ JournalData bool
+ DirSync bool
+ TopDir bool
+ HugeFile bool
+ Extents bool
+ ExtendedAttr bool
+ Inline bool
+ Reserved bool
+}
+
+// ToInt converts inode flags back to its 32-bit rep.
+func (f InodeFlags) ToInt() uint32 {
+ var res uint32
+
+ if f.Sync {
+ res |= InSync
+ }
+ if f.Immutable {
+ res |= InImmutable
+ }
+ if f.Append {
+ res |= InAppend
+ }
+ if f.NoDump {
+ res |= InNoDump
+ }
+ if f.NoAccessTime {
+ res |= InNoAccessTime
+ }
+ if f.Index {
+ res |= InIndex
+ }
+ if f.JournalData {
+ res |= InJournalData
+ }
+ if f.DirSync {
+ res |= InDirSync
+ }
+ if f.TopDir {
+ res |= InTopDir
+ }
+ if f.HugeFile {
+ res |= InHugeFile
+ }
+ if f.Extents {
+ res |= InExtents
+ }
+ if f.ExtendedAttr {
+ res |= InExtendedAttr
+ }
+ if f.Inline {
+ res |= InInline
+ }
+ if f.Reserved {
+ res |= InReserved
+ }
+
+ return res
+}
+
+// InodeFlagsFromInt converts the integer representation of inode flags to
+// a InodeFlags struct.
+func InodeFlagsFromInt(f uint32) InodeFlags {
+ return InodeFlags{
+ Sync: f&InSync > 0,
+ Immutable: f&InImmutable > 0,
+ Append: f&InAppend > 0,
+ NoDump: f&InNoDump > 0,
+ NoAccessTime: f&InNoAccessTime > 0,
+ Index: f&InIndex > 0,
+ JournalData: f&InJournalData > 0,
+ DirSync: f&InDirSync > 0,
+ TopDir: f&InTopDir > 0,
+ HugeFile: f&InHugeFile > 0,
+ Extents: f&InExtents > 0,
+ ExtendedAttr: f&InExtendedAttr > 0,
+ Inline: f&InInline > 0,
+ Reserved: f&InReserved > 0,
+ }
+}
+
+// These masks define how users can view/modify inode flags. The rest of the
+// flags are for internal kernel usage only.
+const (
+ InUserReadFlagMask = 0x4BDFFF
+ InUserWriteFlagMask = 0x4B80FF
+)
diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext4/disklayout/superblock.go
index d630ba8a6..030c73410 100644
--- a/pkg/sentry/fs/ext4/disklayout/superblock.go
+++ b/pkg/sentry/fs/ext4/disklayout/superblock.go
@@ -12,9 +12,25 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// Package disklayout provides Linux ext file system's disk level structures
+// which can be directly read into from the underlying device. All structures
+// on disk are in little-endian order. Only jbd2 (journal) structures are in
+// big-endian order. Structs aim to emulate structures `exactly` how they are
+// layed out on disk.
+//
+// This library aims to be compatible with all ext(2/3/4) systems so it
+// provides a generic interface for all major structures and various
+// implementations (for different versions). The user code is responsible for
+// using appropriate implementations based on the underlying device.
+//
+// Notes:
+// - All fields in these structs are exported because binary.Read would
+// panic otherwise.
+// - All OS dependent fields in these structures will be interpretted using
+// the Linux version of that field.
package disklayout
-// SuperBlock should be implemented by structs representing ext4 superblock.
+// SuperBlock should be implemented by structs representing the ext superblock.
// The superblock holds a lot of information about the enclosing filesystem.
// This interface aims to provide access methods to important information held
// by the superblock. It does NOT expose all fields of the superblock, only the
@@ -23,11 +39,11 @@ package disklayout
// Location and replication:
// - The superblock is located at offset 1024 in block group 0.
// - Redundant copies of the superblock and group descriptors are kept in
-// all groups if sparse_super feature flag is NOT set. If it is set, the
+// all groups if SbSparse feature flag is NOT set. If it is set, the
// replicas only exist in groups whose group number is either 0 or a
// power of 3, 5, or 7.
// - There is also a sparse superblock feature v2 in which there are just
-// two replicas saved in block groups pointed by the s_backup_bgs field.
+// two replicas saved in the block groups pointed by sb.s_backup_bgs.
//
// Replicas should eventually be updated if the superblock is updated.
//
diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md
index e1630553b..71a577d9d 100644
--- a/pkg/sentry/fs/g3doc/inotify.md
+++ b/pkg/sentry/fs/g3doc/inotify.md
@@ -9,7 +9,7 @@ For the most part, the sentry implementation of inotify mirrors the Linux
architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are
backed by a pseudo-filesystem. Events are generated from various places in the
sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and
-the [process fd table][fd_map]. Watches are stored in inodes and generated
+the [process fd table][fd_table]. Watches are stored in inodes and generated
events are queued to the inotify instance owning the watches for delivery to the
user.
@@ -114,7 +114,7 @@ ordering.
[dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go
[event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go
-[fd_map]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_map.go
+[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go
[inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go
[inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go
[inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index da41a10ab..70ed854a8 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -42,7 +42,6 @@ go_library(
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/mm",
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go
index ea7aded9a..bee421d76 100644
--- a/pkg/sentry/fs/proc/fds.go
+++ b/pkg/sentry/fs/proc/fds.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -42,8 +41,8 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
var file *fs.File
var fdFlags kernel.FDFlags
t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- file, fdFlags = fdm.GetDescriptor(kdefs.FD(n))
+ if fdTable := t.FDTable(); fdTable != nil {
+ file, fdFlags = fdTable.Get(int32(n))
}
})
if file == nil {
@@ -56,36 +55,31 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
// toDentAttr callback for each to get a DentAttr, which it then emits. This is
// a helper for implementing fs.InodeOperations.Readdir.
func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) {
- var fds kernel.FDs
+ var fds []int32
t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- fds = fdm.GetFDs()
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.GetFDs()
}
})
- fdInts := make([]int, 0, len(fds))
- for _, fd := range fds {
- fdInts = append(fdInts, int(fd))
- }
-
- // Find the fd to start at.
- idx := sort.SearchInts(fdInts, int(offset))
- if idx == len(fdInts) {
+ // Find the appropriate starting point.
+ idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) })
+ if idx == len(fds) {
return offset, nil
}
- fdInts = fdInts[idx:]
+ fds = fds[idx:]
- var fd int
- for _, fd = range fdInts {
+ // Serialize all FDs.
+ for _, fd := range fds {
name := strconv.FormatUint(uint64(fd), 10)
- if err := c.DirEmit(name, toDentAttr(fd)); err != nil {
+ if err := c.DirEmit(name, toDentAttr(int(fd))); err != nil {
// Returned offset is the next fd to serialize.
return int64(fd), err
}
}
// We serialized them all. Next offset should be higher than last
// serialized fd.
- return int64(fd + 1), nil
+ return int64(fds[len(fds)-1] + 1), nil
}
// fd implements fs.InodeOperations for a file in /proc/TID/fd/.
@@ -154,9 +148,9 @@ func (f *fd) Close() error {
type fdDir struct {
ramfs.Dir
- // We hold a reference on the task's fdmap but only keep an indirect
- // task pointer to avoid Dirent loading circularity caused by fdmap's
- // potential back pointers into the dirent tree.
+ // We hold a reference on the task's FDTable but only keep an indirect
+ // task pointer to avoid Dirent loading circularity caused by the
+ // table's back pointers into the dirent tree.
t *kernel.Task
}
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index b2e36aeee..ef0ca3301 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -580,8 +580,8 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
var fds int
var vss, rss, data uint64
s.t.WithMuLocked(func(t *kernel.Task) {
- if fdm := t.FDMap(); fdm != nil {
- fds = fdm.Size()
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.Size()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index c172d399e..7b92f1b8d 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -96,7 +96,8 @@ go_library(
srcs = [
"abstract_socket_namespace.go",
"context.go",
- "fd_map.go",
+ "fd_table.go",
+ "fd_table_unsafe.go",
"fs_context.go",
"ipc_namespace.go",
"kernel.go",
@@ -179,7 +180,6 @@ go_library(
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/epoll",
"//pkg/sentry/kernel/futex",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/sched",
"//pkg/sentry/kernel/semaphore",
"//pkg/sentry/kernel/shm",
@@ -214,7 +214,7 @@ go_test(
name = "kernel_test",
size = "small",
srcs = [
- "fd_map_test.go",
+ "fd_table_test.go",
"table_test.go",
"task_test.go",
"timekeeper_test.go",
@@ -223,9 +223,10 @@ go_test(
deps = [
"//pkg/abi",
"//pkg/sentry/arch",
+ "//pkg/sentry/context",
"//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
"//pkg/sentry/fs/filetest",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/sched",
"//pkg/sentry/limits",
"//pkg/sentry/pgalloc",
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index fb99cfc8f..f46c43128 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -30,7 +30,6 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/fsutil",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 33c7dccae..9c0a4e1b4 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -61,7 +60,7 @@ const (
// +stateify savable
type FileIdentifier struct {
File *fs.File `state:"wait"`
- Fd kdefs.FD
+ Fd int32
}
// pollEntry holds all the state associated with an event poll entry, that is,
diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go
deleted file mode 100644
index 1b84bfe14..000000000
--- a/pkg/sentry/kernel/fd_map.go
+++ /dev/null
@@ -1,366 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kernel
-
-import (
- "bytes"
- "fmt"
- "sort"
- "sync"
- "sync/atomic"
- "syscall"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/refs"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/lock"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
- "gvisor.dev/gvisor/pkg/sentry/limits"
-)
-
-// FDs is an ordering of FD's that can be made stable.
-type FDs []kdefs.FD
-
-func (f FDs) Len() int {
- return len(f)
-}
-
-func (f FDs) Swap(i, j int) {
- f[i], f[j] = f[j], f[i]
-}
-
-func (f FDs) Less(i, j int) bool {
- return f[i] < f[j]
-}
-
-// FDFlags define flags for an individual descriptor.
-//
-// +stateify savable
-type FDFlags struct {
- // CloseOnExec indicates the descriptor should be closed on exec.
- CloseOnExec bool
-}
-
-// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
-// representation.
-func (f FDFlags) ToLinuxFileFlags() (mask uint) {
- if f.CloseOnExec {
- mask |= linux.O_CLOEXEC
- }
- return
-}
-
-// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
-// representation.
-func (f FDFlags) ToLinuxFDFlags() (mask uint) {
- if f.CloseOnExec {
- mask |= linux.FD_CLOEXEC
- }
- return
-}
-
-// descriptor holds the details about a file descriptor, namely a pointer the
-// file itself and the descriptor flags.
-//
-// +stateify savable
-type descriptor struct {
- file *fs.File
- flags FDFlags
-}
-
-// FDMap is used to manage File references and flags.
-//
-// +stateify savable
-type FDMap struct {
- refs.AtomicRefCount
- k *Kernel
- files map[kdefs.FD]descriptor
- mu sync.RWMutex `state:"nosave"`
- uid uint64
-}
-
-// ID returns a unique identifier for this FDMap.
-func (f *FDMap) ID() uint64 {
- return f.uid
-}
-
-// NewFDMap allocates a new FDMap that may be used by tasks in k.
-func (k *Kernel) NewFDMap() *FDMap {
- f := FDMap{
- k: k,
- files: make(map[kdefs.FD]descriptor),
- uid: atomic.AddUint64(&k.fdMapUids, 1),
- }
- f.EnableLeakCheck("kernel.FDMap")
- return &f
-}
-
-// destroy removes all of the file descriptors from the map.
-func (f *FDMap) destroy() {
- f.RemoveIf(func(*fs.File, FDFlags) bool {
- return true
- })
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FDMap) DecRef() {
- f.DecRefWithDestructor(f.destroy)
-}
-
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDMap) Size() int {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- return len(f.files)
-}
-
-// String is a stringer for FDMap.
-func (f *FDMap) String() string {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- var b bytes.Buffer
- for k, v := range f.files {
- n, _ := v.file.Dirent.FullName(nil /* root */)
- b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n))
- }
- return b.String()
-}
-
-// NewFDFrom allocates a new FD guaranteed to be the lowest number available
-// greater than or equal to from. This property is important as Unix programs
-// tend to count on this allocation order.
-func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) {
- if fd < 0 {
- // Don't accept negative FDs.
- return 0, syscall.EINVAL
- }
-
- f.mu.Lock()
- defer f.mu.Unlock()
-
- // Finds the lowest fd not in the handles map.
- lim := limitSet.Get(limits.NumberOfFiles)
- for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ {
- if _, ok := f.files[i]; !ok {
- file.IncRef()
- f.files[i] = descriptor{file, flags}
- return i, nil
- }
- }
-
- return -1, syscall.EMFILE
-}
-
-// NewFDAt sets the file reference for the given FD. If there is an
-// active reference for that FD, the ref count for that existing reference
-// is decremented.
-func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error {
- if fd < 0 {
- // Don't accept negative FDs.
- return syscall.EBADF
- }
-
- // In this one case we do not do a defer of the Unlock. The
- // reason is that we must have done all the work needed for
- // discarding any old open file before we return to the
- // caller. In other words, the DecRef(), below, must have
- // completed by the time we return to the caller to ensure
- // side effects are, in fact, effected. A classic example is
- // dup2(fd1, fd2); if fd2 was already open, it must be closed,
- // and we don't want to resume the caller until it is; we have
- // to block on the DecRef(). Hence we can not just do a 'go
- // oldfile.DecRef()', since there would be no guarantee that
- // it would be done before we the caller resumed. Since we
- // must wait for the DecRef() to finish, and that could take
- // time, it's best to first call f.muUnlock beore so we are
- // not blocking other uses of this FDMap on the DecRef() call.
- f.mu.Lock()
- oldDesc, oldExists := f.files[fd]
- lim := limitSet.Get(limits.NumberOfFiles).Cur
- // if we're closing one then the effective limit is one
- // more than the actual limit.
- if oldExists && lim != limits.Infinity {
- lim++
- }
- if lim != limits.Infinity && fd >= kdefs.FD(lim) {
- f.mu.Unlock()
- return syscall.EMFILE
- }
-
- file.IncRef()
- f.files[fd] = descriptor{file, flags}
- f.mu.Unlock()
-
- if oldExists {
- oldDesc.file.DecRef()
- }
- return nil
-}
-
-// SetFlags sets the flags for the given file descriptor, if it is valid.
-func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- desc, ok := f.files[fd]
- if !ok {
- return
- }
-
- f.files[fd] = descriptor{desc.file, flags}
-}
-
-// GetDescriptor returns a reference to the file and the flags for the FD. It
-// bumps its reference count as well. It returns nil if there is no File
-// for the FD, i.e. if the FD is invalid. The caller must use DecRef
-// when they are done.
-func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- if desc, ok := f.files[fd]; ok {
- desc.file.IncRef()
- return desc.file, desc.flags
- }
- return nil, FDFlags{}
-}
-
-// GetFile returns a reference to the File for the FD and bumps
-// its reference count as well. It returns nil if there is no File
-// for the FD, i.e. if the FD is invalid. The caller must use DecRef
-// when they are done.
-func (f *FDMap) GetFile(fd kdefs.FD) *fs.File {
- f.mu.RLock()
- if desc, ok := f.files[fd]; ok {
- desc.file.IncRef()
- f.mu.RUnlock()
- return desc.file
- }
- f.mu.RUnlock()
- return nil
-}
-
-// fds returns an ordering of FDs.
-func (f *FDMap) fds() FDs {
- fds := make(FDs, 0, len(f.files))
- for fd := range f.files {
- fds = append(fds, fd)
- }
- sort.Sort(fds)
- return fds
-}
-
-// GetFDs returns a list of valid fds.
-func (f *FDMap) GetFDs() FDs {
- f.mu.RLock()
- defer f.mu.RUnlock()
- return f.fds()
-}
-
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDMap) GetRefs() []*fs.File {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- fds := f.fds()
- fs := make([]*fs.File, 0, len(fds))
- for _, fd := range fds {
- desc := f.files[fd]
- desc.file.IncRef()
- fs = append(fs, desc.file)
- }
- return fs
-}
-
-// Fork returns an independent FDMap pointing to the same descriptors.
-func (f *FDMap) Fork() *FDMap {
- f.mu.RLock()
- defer f.mu.RUnlock()
-
- clone := f.k.NewFDMap()
-
- // Grab a extra reference for every file.
- for fd, desc := range f.files {
- desc.file.IncRef()
- clone.files[fd] = desc
- }
-
- // That's it!
- return clone
-}
-
-// unlock releases all file locks held by this FDMap's uid. Must only be
-// called on a non-nil *fs.File.
-func (f *FDMap) unlock(file *fs.File) {
- id := lock.UniqueID(f.ID())
- file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF})
-}
-
-// inotifyFileClose generates the appropriate inotify events for f being closed.
-func inotifyFileClose(f *fs.File) {
- var ev uint32
- d := f.Dirent
-
- if fs.IsDir(d.Inode.StableAttr) {
- ev |= linux.IN_ISDIR
- }
-
- if f.Flags().Write {
- ev |= linux.IN_CLOSE_WRITE
- } else {
- ev |= linux.IN_CLOSE_NOWRITE
- }
-
- d.InotifyEvent(ev, 0)
-}
-
-// Remove removes an FD from the FDMap, and returns (File, true) if a File
-// one was found. Callers are expected to decrement the reference count on
-// the File. Otherwise returns (nil, false).
-func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) {
- f.mu.Lock()
- desc := f.files[fd]
- delete(f.files, fd)
- f.mu.Unlock()
- if desc.file != nil {
- f.unlock(desc.file)
- inotifyFileClose(desc.file)
- return desc.file, true
- }
- return nil, false
-}
-
-// RemoveIf removes all FDs where cond is true.
-func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) {
- var removed []*fs.File
- f.mu.Lock()
- for fd, desc := range f.files {
- if desc.file != nil && cond(desc.file, desc.flags) {
- delete(f.files, fd)
- removed = append(removed, desc.file)
- }
- }
- f.mu.Unlock()
-
- for _, file := range removed {
- f.unlock(file)
- inotifyFileClose(file)
- file.DecRef()
- }
-}
diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go
deleted file mode 100644
index 8571dbe59..000000000
--- a/pkg/sentry/kernel/fd_map_test.go
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kernel
-
-import (
- "testing"
-
- "gvisor.dev/gvisor/pkg/sentry/fs/filetest"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
- "gvisor.dev/gvisor/pkg/sentry/limits"
-)
-
-const (
- // maxFD is the maximum FD to try to create in the map.
- // This number of open files has been seen in the wild.
- maxFD = 2 * 1024
-)
-
-func newTestFDMap() *FDMap {
- return &FDMap{
- files: make(map[kdefs.FD]descriptor),
- }
-}
-
-// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap,
-// until there is no room, then makes sure that NewFDAt works
-// and also that if we remove one and add one that works too.
-func TestFDMapMany(t *testing.T) {
- file := filetest.NewTestFile(t)
- limitSet := limits.NewLimitSet()
- limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */)
-
- f := newTestFDMap()
- for i := 0; i < maxFD; i++ {
- if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
- t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD)
- }
- }
-
- if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil {
- t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error")
- }
-
- if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil {
- t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
- }
-}
-
-// TestFDMap does a set of simple tests to make sure simple adds,
-// removes, GetRefs, and DecRefs work. The ordering is just weird
-// enough that a table-driven approach seemed clumsy.
-func TestFDMap(t *testing.T) {
- file := filetest.NewTestFile(t)
- limitSet := limits.NewLimitSet()
- limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true /* privileged */)
-
- f := newTestFDMap()
- if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
- t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err)
- }
-
- if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil {
- t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error")
- }
-
- largeLimit := limits.Limit{maxFD, maxFD}
- limitSet.Set(limits.NumberOfFiles, largeLimit, true /* privileged */)
-
- if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
- t.Fatalf("Adding an FD to a resized map: got %v, want nil", err)
- } else if fd != kdefs.FD(1) {
- t.Fatalf("Added an FD to a resized map: got %v, want 1", fd)
- }
-
- if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil {
- t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
- }
-
- if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil {
- t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1)
- }
-
- if ref := f.GetFile(1); ref == nil {
- t.Fatalf("f.GetFile(1): got nil, wanted %v", file)
- }
-
- if ref := f.GetFile(2); ref != nil {
- t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref)
- }
-
- ref, ok := f.Remove(1)
- if !ok {
- t.Fatalf("f.Remove(1) for an existing FD: failed, want success")
- }
- ref.DecRef()
-
- if ref, ok := f.Remove(1); ok {
- ref.DecRef()
- t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
- }
-
-}
-
-func TestDescriptorFlags(t *testing.T) {
- file := filetest.NewTestFile(t)
- f := newTestFDMap()
- limitSet := limits.NewLimitSet()
- limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */)
-
- origFlags := FDFlags{CloseOnExec: true}
-
- if err := f.NewFDAt(2, file, origFlags, limitSet); err != nil {
- t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err)
- }
-
- newFile, newFlags := f.GetDescriptor(2)
- if newFile == nil {
- t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile)
- }
-
- if newFlags != origFlags {
- t.Fatalf("new File flags %+v don't match original %+v", newFlags, origFlags)
- }
-}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
new file mode 100644
index 000000000..1f3a57dc1
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table.go
@@ -0,0 +1,380 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "fmt"
+ "math"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+)
+
+// FDFlags define flags for an individual descriptor.
+//
+// +stateify savable
+type FDFlags struct {
+ // CloseOnExec indicates the descriptor should be closed on exec.
+ CloseOnExec bool
+}
+
+// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
+// representation.
+func (f FDFlags) ToLinuxFileFlags() (mask uint) {
+ if f.CloseOnExec {
+ mask |= linux.O_CLOEXEC
+ }
+ return
+}
+
+// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
+// representation.
+func (f FDFlags) ToLinuxFDFlags() (mask uint) {
+ if f.CloseOnExec {
+ mask |= linux.FD_CLOEXEC
+ }
+ return
+}
+
+// descriptor holds the details about a file descriptor, namely a pointer to
+// the file itself and the descriptor flags.
+//
+// Note that this is immutable and can only be changed via operations on the
+// descriptorTable.
+//
+// +stateify savable
+type descriptor struct {
+ file *fs.File
+ flags FDFlags
+}
+
+// FDTable is used to manage File references and flags.
+//
+// +stateify savable
+type FDTable struct {
+ refs.AtomicRefCount
+ k *Kernel
+
+ // uid is a unique identifier.
+ uid uint64
+
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // used contains the number of non-nil entries.
+ used int32
+
+ // descriptorTable holds descriptors.
+ descriptorTable `state:".(map[int32]descriptor)"`
+}
+
+func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
+ m := make(map[int32]descriptor)
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ m[fd] = descriptor{
+ file: file,
+ flags: flags,
+ }
+ })
+ return m
+}
+
+func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
+ f.init() // Initialize table.
+ for fd, d := range m {
+ f.set(fd, d.file, d.flags)
+
+ // Note that we do _not_ need to acquire a extra table
+ // reference here. The table reference will already be
+ // accounted for in the file, so we drop the reference taken by
+ // set above.
+ d.file.DecRef()
+ }
+}
+
+// drop drops the table reference.
+func (f *FDTable) drop(file *fs.File) {
+ // Release locks.
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
+
+ // Send inotify events.
+ d := file.Dirent
+ var ev uint32
+ if fs.IsDir(d.Inode.StableAttr) {
+ ev |= linux.IN_ISDIR
+ }
+ if file.Flags().Write {
+ ev |= linux.IN_CLOSE_WRITE
+ } else {
+ ev |= linux.IN_CLOSE_NOWRITE
+ }
+ d.InotifyEvent(ev, 0)
+
+ // Drop the table reference.
+ file.DecRef()
+}
+
+// ID returns a unique identifier for this FDTable.
+func (f *FDTable) ID() uint64 {
+ return f.uid
+}
+
+// NewFDTable allocates a new FDTable that may be used by tasks in k.
+func (k *Kernel) NewFDTable() *FDTable {
+ f := &FDTable{
+ k: k,
+ uid: atomic.AddUint64(&k.fdMapUids, 1),
+ }
+ f.init()
+ return f
+}
+
+// destroy removes all of the file descriptors from the map.
+func (f *FDTable) destroy() {
+ f.RemoveIf(func(*fs.File, FDFlags) bool {
+ return true
+ })
+}
+
+// DecRef implements RefCounter.DecRef with destructor f.destroy.
+func (f *FDTable) DecRef() {
+ f.DecRefWithDestructor(f.destroy)
+}
+
+// Size returns the number of file descriptor slots currently allocated.
+func (f *FDTable) Size() int {
+ size := atomic.LoadInt32(&f.used)
+ return int(size)
+}
+
+// forEach iterates over all non-nil files.
+//
+// It is the caller's responsibility to acquire an appropriate lock.
+func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
+ fd := int32(0)
+ for {
+ file, flags, ok := f.get(fd)
+ if !ok {
+ break
+ }
+ if file != nil {
+ if !file.TryIncRef() {
+ continue // Race caught.
+ }
+ fn(int32(fd), file, flags)
+ file.DecRef()
+ }
+ fd++
+ }
+}
+
+// String is a stringer for FDTable.
+func (f *FDTable) String() string {
+ var b bytes.Buffer
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ n, _ := file.Dirent.FullName(nil /* root */)
+ b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+ })
+ return b.String()
+}
+
+// NewFDs allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the set
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return nil, syscall.EINVAL
+ }
+
+ // Default limit.
+ end := int32(math.MaxInt32)
+
+ // Ensure we don't get past the provided limit.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ lim := limitSet.Get(limits.NumberOfFiles)
+ if lim.Cur != limits.Infinity {
+ end = int32(lim.Cur)
+ }
+ if fd >= end {
+ return nil, syscall.EMFILE
+ }
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // Install all entries.
+ for i := fd; i < end && len(fds) < len(files); i++ {
+ if d, _, _ := f.get(i); d == nil {
+ f.set(i, files[len(fds)], flags) // Set the descriptor.
+ fds = append(fds, i) // Record the file descriptor.
+ }
+ }
+
+ // Failure? Unwind existing FDs.
+ if len(fds) < len(files) {
+ for _, i := range fds {
+ f.set(i, nil, FDFlags{}) // Zap entry.
+ }
+ return nil, syscall.EMFILE
+ }
+
+ return fds, nil
+}
+
+// NewFDAt sets the file reference for the given FD. If there is an active
+// reference for that FD, the ref count for that existing reference is
+// decremented.
+func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // Check the limit for the provided file.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
+ return syscall.EMFILE
+ }
+ }
+
+ // Install the entry.
+ f.set(fd, file, flags)
+ return nil
+}
+
+// SetFlags sets the flags for the given file descriptor.
+//
+// True is returned iff flags were changed.
+func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ file, _, _ := f.get(fd)
+ if file == nil {
+ // No file found.
+ return syscall.EBADF
+ }
+
+ // Update the flags.
+ f.set(fd, file, flags)
+ return nil
+}
+
+// Get returns a reference to the file and the flags for the FD or nil if no
+// file is defined for the given fd.
+//
+// N.B. Callers are required to use DecRef when they are done.
+//
+//go:nosplit
+func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
+ if fd < 0 {
+ return nil, FDFlags{}
+ }
+
+ for {
+ file, flags, _ := f.get(fd)
+ if file != nil {
+ if !file.TryIncRef() {
+ continue // Race caught.
+ }
+ // Reference acquired.
+ return file, flags
+ }
+ // No file available.
+ return nil, FDFlags{}
+ }
+}
+
+// GetFDs returns a list of valid fds.
+func (f *FDTable) GetFDs() []int32 {
+ fds := make([]int32, 0, f.used)
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ fds = append(fds, fd)
+ })
+ return fds
+}
+
+// GetRefs returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDTable) GetRefs() []*fs.File {
+ files := make([]*fs.File, 0, f.Size())
+ f.forEach(func(_ int32, file *fs.File, flags FDFlags) {
+ file.IncRef() // Acquire a reference for caller.
+ files = append(files, file)
+ })
+ return files
+}
+
+// Fork returns an independent FDTable.
+func (f *FDTable) Fork() *FDTable {
+ clone := f.k.NewFDTable()
+
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ // The set function here will acquire an appropriate table
+ // reference for the clone. We don't need anything else.
+ clone.set(fd, file, flags)
+ })
+ return clone
+}
+
+// Remove removes an FD from and returns a non-file iff successful.
+//
+// N.B. Callers are required to use DecRef when they are done.
+func (f *FDTable) Remove(fd int32) *fs.File {
+ if fd < 0 {
+ return nil
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ orig, _, _ := f.get(fd)
+ if orig != nil {
+ orig.IncRef() // Reference for caller.
+ f.set(fd, nil, FDFlags{}) // Zap entry.
+ }
+ return orig
+}
+
+// RemoveIf removes all FDs where cond is true.
+func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ if cond(file, flags) {
+ f.set(fd, nil, FDFlags{}) // Clear from table.
+ }
+ })
+}
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
new file mode 100644
index 000000000..2413788e7
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -0,0 +1,192 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "runtime"
+ "sync"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/filetest"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+)
+
+const (
+ // maxFD is the maximum FD to try to create in the map.
+ //
+ // This number of open files has been seen in the wild.
+ maxFD = 2 * 1024
+)
+
+func runTest(t testing.TB, fn func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet)) {
+ t.Helper() // Don't show in stacks.
+
+ // Create the limits and context.
+ limitSet := limits.NewLimitSet()
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true)
+ ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet)
+
+ // Create a test file.;
+ file := filetest.NewTestFile(t)
+
+ // Create the table.
+ fdTable := new(FDTable)
+ fdTable.init()
+
+ // Run the test.
+ fn(ctx, fdTable, file, limitSet)
+}
+
+// TestFDTableMany allocates maxFD FDs, i.e. maxes out the FDTable, until there
+// is no room, then makes sure that NewFDAt works and also that if we remove
+// one and add one that works too.
+func TestFDTableMany(t *testing.T) {
+ runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+ for i := 0; i < maxFD; i++ {
+ if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+ t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD)
+ }
+ }
+
+ if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil {
+ t.Fatalf("fdTable.NewFDs(0, r) in full map: got nil, wanted error")
+ }
+
+ if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil {
+ t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
+ }
+ })
+}
+
+// TestFDTable does a set of simple tests to make sure simple adds, removes,
+// GetRefs, and DecRefs work. The ordering is just weird enough that a
+// table-driven approach seemed clumsy.
+func TestFDTable(t *testing.T) {
+ runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet) {
+ // Cap the limit at one.
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true)
+
+ if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+ t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err)
+ }
+
+ if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil {
+ t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error")
+ }
+
+ // Remove the previous limit.
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true)
+
+ if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+ t.Fatalf("Adding an FD to a resized map: got %v, want nil", err)
+ } else if len(fds) != 1 || fds[0] != 1 {
+ t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds)
+ }
+
+ if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil {
+ t.Fatalf("Replacing FD 1 via fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
+ }
+
+ if err := fdTable.NewFDAt(ctx, maxFD+1, file, FDFlags{}); err == nil {
+ t.Fatalf("Using an FD that was too large via fdTable.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1)
+ }
+
+ if ref, _ := fdTable.Get(1); ref == nil {
+ t.Fatalf("fdTable.Get(1): got nil, wanted %v", file)
+ }
+
+ if ref, _ := fdTable.Get(2); ref != nil {
+ t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
+ }
+
+ ref := fdTable.Remove(1)
+ if ref == nil {
+ t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
+ }
+ ref.DecRef()
+
+ if ref := fdTable.Remove(1); ref != nil {
+ t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
+ }
+ })
+}
+
+func TestDescriptorFlags(t *testing.T) {
+ runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+ if err := fdTable.NewFDAt(ctx, 2, file, FDFlags{CloseOnExec: true}); err != nil {
+ t.Fatalf("fdTable.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err)
+ }
+
+ newFile, flags := fdTable.Get(2)
+ if newFile == nil {
+ t.Fatalf("fdTable.Get(2): got a %v, wanted nil", newFile)
+ }
+
+ if !flags.CloseOnExec {
+ t.Fatalf("new File flags %v don't match original %d\n", flags, 0)
+ }
+ })
+}
+
+func BenchmarkFDLookupAndDecRef(b *testing.B) {
+ b.StopTimer() // Setup.
+
+ runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+ fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{})
+ if err != nil {
+ b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err)
+ }
+
+ b.StartTimer() // Benchmark.
+ for i := 0; i < b.N; i++ {
+ tf, _ := fdTable.Get(fds[i%len(fds)])
+ tf.DecRef()
+ }
+ })
+}
+
+func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) {
+ b.StopTimer() // Setup.
+
+ runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+ fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{})
+ if err != nil {
+ b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err)
+ }
+
+ concurrency := runtime.GOMAXPROCS(0)
+ if concurrency < 4 {
+ concurrency = 4
+ }
+ each := b.N / concurrency
+
+ b.StartTimer() // Benchmark.
+ var wg sync.WaitGroup
+ for i := 0; i < concurrency; i++ {
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ for i := 0; i < each; i++ {
+ tf, _ := fdTable.Get(fds[i%len(fds)])
+ tf.DecRef()
+ }
+ }()
+ }
+ wg.Wait()
+ })
+}
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
new file mode 100644
index 000000000..e009df974
--- /dev/null
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -0,0 +1,103 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync/atomic"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+type descriptorTable struct {
+ // slice is a *[]unsafe.Pointer, where each element is actually
+ // *descriptor object, updated atomically.
+ //
+ // Changes to the slice itself requiring holding FDTable.mu.
+ slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
+}
+
+// init initializes the table.
+func (f *FDTable) init() {
+ var slice []unsafe.Pointer // Empty slice.
+ atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+}
+
+// get gets a file entry.
+//
+// The boolean indicates whether this was in range.
+//
+//go:nosplit
+func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+ if fd >= int32(len(slice)) {
+ return nil, FDFlags{}, false
+ }
+ d := (*descriptor)(atomic.LoadPointer(&slice[fd]))
+ if d == nil {
+ return nil, FDFlags{}, true
+ }
+ return d.file, d.flags, true
+}
+
+// set sets an entry.
+//
+// This handles accounting changes, as well as acquiring and releasing the
+// reference needed by the table iff the file is different.
+//
+// Precondition: mu must be held.
+func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+
+ // Grow the table as required.
+ if last := int32(len(slice)); fd >= last {
+ end := fd + 1
+ if end < 2*last {
+ end = 2 * last
+ }
+ slice = append(slice, make([]unsafe.Pointer, end-last)...)
+ atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+ }
+
+ // Create the new element.
+ var d *descriptor
+ if file != nil {
+ d = &descriptor{
+ file: file,
+ flags: flags,
+ }
+ }
+
+ // Update the single element.
+ orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d)))
+
+ // Acquire a table reference.
+ if file != nil && (orig == nil || file != orig.file) {
+ file.IncRef()
+ }
+
+ // Drop the table reference.
+ if orig != nil && file != orig.file {
+ f.drop(orig.file)
+ }
+
+ // Adjust used.
+ switch {
+ case orig == nil && file != nil:
+ atomic.AddInt32(&f.used, 1)
+ case orig != nil && file == nil:
+ atomic.AddInt32(&f.used, -1)
+ }
+}
diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD
deleted file mode 100644
index 5d62f406a..000000000
--- a/pkg/sentry/kernel/kdefs/BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
- name = "kdefs",
- srcs = ["kdefs.go"],
- importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs",
- visibility = ["//:sandbox"],
-)
diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go
deleted file mode 100644
index 304da2032..000000000
--- a/pkg/sentry/kernel/kdefs/kdefs.go
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package kdefs defines common kernel definitions.
-//
-package kdefs
-
-// FD is a File Descriptor.
-type FD int32
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 47dadc43a..38b49cba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -155,7 +155,7 @@ type Kernel struct {
// cpuClockTicker increments cpuClock.
cpuClockTicker *ktime.Timer `state:"nosave"`
- // fdMapUids is an ever-increasing counter for generating FDMap uids.
+ // fdMapUids is an ever-increasing counter for generating FDTable uids.
//
// fdMapUids is mutable, and is accessed using atomic memory operations.
fdMapUids uint64
@@ -400,8 +400,8 @@ func (k *Kernel) flushMountSourceRefs() error {
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(desc descriptor) error {
- desc.file.Dirent.Inode.MountSource.FlushDirentRefs()
+ return k.tasks.forEachFDPaused(func(file *fs.File) error {
+ file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
}
@@ -410,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error {
// task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error {
+func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if t.fds == nil {
+ if t.fdTable == nil {
continue
}
- for _, desc := range t.fds.files {
- if err := f(desc); err != nil {
- return err
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if lastErr := f(file); lastErr != nil && err == nil {
+ err = lastErr
}
- }
+ })
}
- return nil
+ return err
}
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
- return ts.forEachFDPaused(func(desc descriptor) error {
- if flags := desc.file.Flags(); !flags.Write {
+ return ts.forEachFDPaused(func(file *fs.File) error {
+ if flags := file.Flags(); !flags.Write {
return nil
}
- if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
+ if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
return nil
}
// Here we need all metadata synced.
- syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
if err := fs.SaveFileFsyncError(syncErr); err != nil {
- name, _ := desc.file.Dirent.FullName(nil /* root */)
+ name, _ := file.Dirent.FullName(nil /* root */)
// Wrap this error in ErrSaveRejection
// so that it will trigger a save
// error, rather than a panic. This
@@ -483,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() {
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
- if fdmap := t.fds; fdmap != nil {
- for _, desc := range fdmap.files {
- if desc.file != nil {
- if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok {
- e.UnregisterEpollWaiters()
- }
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
}
- }
+ })
}
}
}
@@ -538,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
}
log.Infof("Memory load took [%s].", time.Since(memoryStart))
+ log.Infof("Overall load took [%s]", time.Since(loadStart))
+
// Ensure that all pending asynchronous work is complete:
// - namedpipe opening
// - inode file opening
@@ -602,9 +602,9 @@ type CreateProcessArgs struct {
// Credentials is the initial credentials.
Credentials *auth.Credentials
- // FDMap is the initial set of file descriptors. If CreateProcess succeeds,
- // it takes a reference on FDMap.
- FDMap *FDMap
+ // FDTable is the initial set of file descriptors. If CreateProcess succeeds,
+ // it takes a reference on FDTable.
+ FDTable *FDTable
// Umask is the initial umask.
Umask uint
@@ -789,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
return nil, 0, errors.New(se.String())
}
- // Take a reference on the FDMap, which will be transferred to
+ // Take a reference on the FDTable, which will be transferred to
// TaskSet.NewTask().
- args.FDMap.IncRef()
+ args.FDTable.IncRef()
// Create the task.
config := &TaskConfig{
@@ -799,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
ThreadGroup: tg,
TaskContext: tc,
FSContext: newFSContext(root, wd, args.Umask),
- FDMap: args.FDMap,
+ FDTable: args.FDTable,
Credentials: args.Credentials,
AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
UTSNamespace: args.UTSNamespace,
@@ -871,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() {
}
// By precondition, nothing else can be interacting with PIDNamespace.tids
- // or FDMap.files, so we can iterate them without synchronization. (We
+ // or FDTable.files, so we can iterate them without synchronization. (We
// can't hold the TaskSet mutex when pausing thread group timers because
// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
// mutex, while holding the Timer mutex.)
@@ -882,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() {
it.PauseTimer()
}
}
- // This means we'll iterate FDMaps shared by multiple tasks repeatedly,
+ // This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
- if fdm := t.fds; fdm != nil {
- for _, desc := range fdm.files {
- if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.PauseTimer()
}
- }
+ })
}
}
k.timekeeper.PauseUpdates()
@@ -914,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() {
it.ResumeTimer()
}
}
- if fdm := t.fds; fdm != nil {
- for _, desc := range fdm.files {
- if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
+ if t.fdTable != nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.ResumeTimer()
}
- }
+ })
}
}
}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 2e3a39d3b..e91f82bb3 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -236,15 +236,15 @@ type Task struct {
// tc is protected by mu, and is owned by the task goroutine.
tc TaskContext
- // fsc is the task's filesystem context.
+ // fsContext is the task's filesystem context.
//
- // fsc is protected by mu, and is owned by the task goroutine.
- fsc *FSContext
+ // fsContext is protected by mu, and is owned by the task goroutine.
+ fsContext *FSContext
- // fds is the task's file descriptor table.
+ // fdTable is the task's file descriptor table.
//
- // fds is protected by mu, and is owned by the task goroutine.
- fds *FDMap
+ // fdTable is protected by mu, and is owned by the task goroutine.
+ fdTable *FDTable
// If vforkParent is not nil, it is the task that created this task with
// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
@@ -602,7 +602,7 @@ func (t *Task) Value(key interface{}) interface{} {
case context.CtxThreadGroupID:
return int32(t.ThreadGroup().ID())
case fs.CtxRoot:
- return t.fsc.RootDirectory()
+ return t.fsContext.RootDirectory()
case fs.CtxDirentCacheLimiter:
return t.k.DirentCacheLimiter
case inet.CtxStack:
@@ -668,7 +668,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
func (t *Task) IsChrooted() bool {
realRoot := t.tg.mounts.Root()
defer realRoot.DecRef()
- root := t.fsc.RootDirectory()
+ root := t.fsContext.RootDirectory()
if root != nil {
defer root.DecRef()
}
@@ -689,16 +689,55 @@ func (t *Task) TaskContext() *TaskContext {
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FSContext() *FSContext {
- return t.fsc
+ return t.fsContext
}
-// FDMap returns t's FDMap. FDMap does not take an additional reference on the
-// returned FDMap.
+// FDTable returns t's FDTable. FDMTable does not take an additional reference
+// on the returned FDMap.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
-func (t *Task) FDMap() *FDMap {
- return t.fds
+func (t *Task) FDTable() *FDTable {
+ return t.fdTable
+}
+
+// GetFile is a convenience wrapper t.FDTable().GetFile.
+//
+// Precondition: same as FDTable.
+func (t *Task) GetFile(fd int32) *fs.File {
+ f, _ := t.fdTable.Get(fd)
+ return f
+}
+
+// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
+ return t.fdTable.NewFDs(t, fd, files, flags)
+}
+
+// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) {
+ fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags)
+ if err != nil {
+ return 0, err
+ }
+ return fds[0], nil
+}
+
+// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
+ return t.fdTable.NewFDAt(t, fd, file, flags)
}
// WithMuLocked executes f with t.mu locked.
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index b5cc3860d..0916fd658 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -214,20 +214,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
}
- var fsc *FSContext
+ var fsContext *FSContext
if opts.NewFSContext {
- fsc = t.fsc.Fork()
+ fsContext = t.fsContext.Fork()
} else {
- fsc = t.fsc
- fsc.IncRef()
+ fsContext = t.fsContext
+ fsContext.IncRef()
}
- var fds *FDMap
+ var fdTable *FDTable
if opts.NewFiles {
- fds = t.fds.Fork()
+ fdTable = t.fdTable.Fork()
} else {
- fds = t.fds
- fds.IncRef()
+ fdTable = t.fdTable
+ fdTable.IncRef()
}
pidns := t.tg.pidns
@@ -251,8 +251,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
ThreadGroup: tg,
SignalMask: t.SignalMask(),
TaskContext: tc,
- FSContext: fsc,
- FDMap: fds,
+ FSContext: fsContext,
+ FDTable: fdTable,
Credentials: creds,
Niceness: t.Niceness(),
NetworkNamespaced: t.netns,
@@ -485,22 +485,22 @@ func (t *Task) Unshare(opts *SharingOptions) error {
// namespace"
t.ipcns = NewIPCNamespace(creds.UserNamespace)
}
- var oldfds *FDMap
+ var oldFDTable *FDTable
if opts.NewFiles {
- oldfds = t.fds
- t.fds = oldfds.Fork()
+ oldFDTable = t.fdTable
+ t.fdTable = oldFDTable.Fork()
}
- var oldfsc *FSContext
+ var oldFSContext *FSContext
if opts.NewFSContext {
- oldfsc = t.fsc
- t.fsc = oldfsc.Fork()
+ oldFSContext = t.fsContext
+ t.fsContext = oldFSContext.Fork()
}
t.mu.Unlock()
- if oldfds != nil {
- oldfds.DecRef()
+ if oldFDTable != nil {
+ oldFDTable.DecRef()
}
- if oldfsc != nil {
- oldfsc.DecRef()
+ if oldFSContext != nil {
+ oldFSContext.DecRef()
}
return nil
}
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index cd85acaef..17a089b90 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -195,7 +195,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tg.pidns.owner.mu.Unlock()
// Remove FDs with the CloseOnExec flag set.
- t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool {
+ t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool {
return flags.CloseOnExec
})
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b97d65185..535f03e50 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -265,8 +265,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
t.unstopVforkParent()
- t.fsc.DecRef()
- t.fds.DecRef()
+ t.fsContext.DecRef()
+ t.fdTable.DecRef()
// If this is the last task to exit from the thread group, release the
// thread group's resources.
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index cf48663b6..a29e9b9eb 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -63,7 +63,7 @@ func (t *Task) DebugDumpState() {
if mm := t.MemoryManager(); mm != nil {
t.Debugf("Mappings:\n%s", mm)
}
- t.Debugf("FDMap:\n%s", t.fds)
+ t.Debugf("FDTable:\n%s", t.fdTable)
}
// debugDumpRegisters logs register state at log level debug.
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 72caae537..a88bf3951 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -52,9 +52,10 @@ type TaskConfig struct {
// succeeds.
FSContext *FSContext
- // FDMap is the FDMap of the new task. A reference must be held on FDMap,
- // which is transferred to TaskSet.NewTask whether or not it succeeds.
- FDMap *FDMap
+ // FDTable is the FDTableof the new task. A reference must be held on
+ // FDMap, which is transferred to TaskSet.NewTask whether or not it
+ // succeeds.
+ FDTable *FDTable
// Credentials is the Credentials of the new task.
Credentials *auth.Credentials
@@ -90,7 +91,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
if err != nil {
cfg.TaskContext.release()
cfg.FSContext.DecRef()
- cfg.FDMap.DecRef()
+ cfg.FDTable.DecRef()
return nil, err
}
return t, nil
@@ -112,8 +113,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
signalMask: cfg.SignalMask,
signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
tc: *tc,
- fsc: cfg.FSContext,
- fds: cfg.FDMap,
+ fsContext: cfg.FSContext,
+ fdTable: cfg.FDTable,
p: cfg.Kernel.Platform.NewContext(),
k: cfg.Kernel,
ptraceTracees: make(map[*Task]struct{}),
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index edfdac2a7..baa12d9a0 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -54,7 +54,7 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
// openPath opens name for loading.
//
// openPath returns the fs.Dirent and an *fs.File for name, which is not
-// installed in the Task FDMap. The caller takes ownership of both.
+// installed in the Task FDTable. The caller takes ownership of both.
//
// name must be a readable, executable, regular file.
func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 7d92e16cc..679087e25 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -426,7 +426,12 @@ func (c *vCPU) unlock() {
// Normal state.
case vCPUUser | vCPUGuest | vCPUWaiter:
// Force a transition: this must trigger a notification when we
- // return from guest mode.
+ // return from guest mode. We must clear vCPUWaiter here
+ // anyways, because BounceToKernel will force a transition only
+ // from ring3 to ring0, which will not clear this bit. Halt may
+ // workaround the issue, but if there is no exception or
+ // syscall in this period, BounceToKernel will hang.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
c.notify()
case vCPUUser | vCPUWaiter:
// Waiting for the lock to be released; the responsibility is
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 7a24d4806..2b03ea87c 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -14,7 +14,6 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/kernel",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 39de46c39..81dbd7309 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -17,7 +17,6 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
"//pkg/syserror",
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index b646dc258..4f4a20dfe 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
@@ -63,7 +62,7 @@ type RightsFiles []*fs.File
func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) {
files := make(RightsFiles, 0, len(fds))
for _, fd := range fds {
- file, _ := t.FDMap().GetDescriptor(kdefs.FD(fd))
+ file := t.GetFile(fd)
if file == nil {
files.Release()
return nil, syserror.EBADF
@@ -109,7 +108,9 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32
files, trunc := rights.Files(t, max)
fds := make([]int32, 0, len(files))
for i := 0; i < max && len(files) > 0; i++ {
- fd, err := t.FDMap().NewFDFrom(0, files[0], kernel.FDFlags{cloexec}, t.ThreadGroup().Limits())
+ fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{
+ CloseOnExec: cloexec,
+ })
files[0].DecRef()
files = files[1:]
if err != nil {
@@ -315,8 +316,7 @@ func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte {
// Parse parses a raw socket control message into portable objects.
func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) {
var (
- fds linux.ControlMessageRights
-
+ fds linux.ControlMessageRights
haveCreds bool
creds linux.ControlMessageCredentials
)
diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD
index 45bb24a3f..1f014f399 100644
--- a/pkg/sentry/socket/epsocket/BUILD
+++ b/pkg/sentry/socket/epsocket/BUILD
@@ -28,7 +28,6 @@ go_library(
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 2a38e370a..b2b2d98a1 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -40,7 +40,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -537,7 +536,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait
// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
ep, wq, terr := s.Endpoint.Accept()
if terr != nil {
@@ -575,10 +574,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- fdFlags := kernel.FDFlags{
+ fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
- }
- fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits())
+ })
t.Kernel().RecordSocket(ns)
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 4f670beb4..a951f1bb0 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -26,7 +26,6 @@ go_library(
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index c63f3aacf..7f69406b7 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -190,7 +189,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
}
// Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
var peerAddr []byte
var peerAddrlen uint32
var peerAddrPtr *byte
@@ -236,11 +235,11 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
defer f.DecRef()
- fdFlags := kernel.FDFlags{
+ kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
- }
- kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits())
+ })
t.Kernel().RecordSocket(f)
+
return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
}
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f6b001b63..45ebb2a0e 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -20,7 +20,6 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/kernel",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/socket",
"//pkg/sentry/socket/netlink/port",
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index ecc1e2d53..f3d6c1e9b 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -27,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
@@ -272,7 +271,7 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr
}
// Accept implements socket.Socket.Accept.
-func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
// Netlink sockets never support accept.
return 0, nil, 0, syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD
index 96d374383..5061dcbde 100644
--- a/pkg/sentry/socket/rpcinet/BUILD
+++ b/pkg/sentry/socket/rpcinet/BUILD
@@ -25,7 +25,6 @@ go_library(
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/socket",
"//pkg/sentry/socket/hostinet",
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go
index cc7b964ea..ccaaddbfc 100644
--- a/pkg/sentry/socket/rpcinet/socket.go
+++ b/pkg/sentry/socket/rpcinet/socket.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
@@ -286,7 +285,7 @@ func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultP
}
// Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
payload, se := rpcAccept(t, s.fd, peerRequested)
// Check if we need to block.
@@ -336,10 +335,9 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
})
defer file.DecRef()
- fdFlags := kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
- }
- fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, 0, syserr.FromError(err)
}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 933120f34..0efa58a58 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -27,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -53,7 +52,7 @@ type Socket interface {
// Accept implements the accept4(2) linux syscall.
// Returns fd, real peer address length and error. Real peer address
// length is only set if len(peer) > 0.
- Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error)
+ Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error)
// Bind implements the bind(2) linux syscall.
Bind(t *kernel.Task, sockaddr []byte) *syserr.Error
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 8580eb87d..da9977fde 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -20,7 +20,6 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/kernel",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index bf7d2cfa2..b30871a90 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -28,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
@@ -194,7 +193,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
ep, err := s.ep.Accept()
if err != nil {
@@ -229,10 +228,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- fdFlags := kernel.FDFlags{
+ fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
- }
- fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits())
+ })
if e != nil {
return 0, nil, 0, syserr.FromError(e)
}
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index d77c7a433..445d25010 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -30,7 +30,6 @@ go_library(
"//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/kernel",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/epsocket",
"//pkg/sentry/socket/netlink",
diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go
index 57cf6b139..5187594a7 100644
--- a/pkg/sentry/strace/poll.go
+++ b/pkg/sentry/strace/poll.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
@@ -50,7 +49,7 @@ func pollFD(t *kernel.Task, pfd *linux.PollFD, post bool) string {
if post {
revents = PollEventSet.Parse(uint64(pfd.REvents))
}
- return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, kdefs.FD(pfd.FD)), PollEventSet.Parse(uint64(pfd.Events)), revents)
+ return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, pfd.FD), PollEventSet.Parse(uint64(pfd.Events)), revents)
}
func pollFDs(t *kernel.Task, addr usermem.Addr, nfds uint, post bool) string {
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 86e9c5690..311389547 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -31,7 +31,6 @@ import (
"gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -133,7 +132,7 @@ func path(t *kernel.Task, addr usermem.Addr) string {
return fmt.Sprintf("%#x %s", addr, path)
}
-func fd(t *kernel.Task, fd kdefs.FD) string {
+func fd(t *kernel.Task, fd int32) string {
root := t.FSContext().RootDirectory()
if root != nil {
defer root.DecRef()
@@ -151,7 +150,7 @@ func fd(t *kernel.Task, fd kdefs.FD) string {
return fmt.Sprintf("AT_FDCWD %s", name)
}
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
// Cast FD to uint64 to avoid printing negative hex.
return fmt.Sprintf("%#x (bad FD)", uint64(fd))
@@ -375,7 +374,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
}
switch i.format[arg] {
case FD:
- output = append(output, fd(t, kdefs.FD(args[arg].Int())))
+ output = append(output, fd(t, args[arg].Int()))
case WriteBuffer:
output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize))
case WriteIOVec:
diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD
index 18fddee76..79d972202 100644
--- a/pkg/sentry/syscalls/BUILD
+++ b/pkg/sentry/syscalls/BUILD
@@ -15,7 +15,6 @@ go_library(
"//pkg/sentry/arch",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/epoll",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/time",
"//pkg/syserror",
"//pkg/waiter",
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
index c710ec9e3..11850dd0f 100644
--- a/pkg/sentry/syscalls/epoll.go
+++ b/pkg/sentry/syscalls/epoll.go
@@ -20,20 +20,18 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/waiter"
)
// CreateEpoll implements the epoll_create(2) linux syscall.
-func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) {
+func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) {
file := epoll.NewEventPoll(t)
defer file.DecRef()
- flags := kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: closeOnExec,
- }
- fd, err := t.FDMap().NewFDFrom(0, file, flags, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, err
}
@@ -42,16 +40,16 @@ func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) {
}
// AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD.
-func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(epfd)
+ epollfile := t.GetFile(epfd)
if epollfile == nil {
return syscall.EBADF
}
defer epollfile.DecRef()
// Get the target file id.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return syscall.EBADF
}
@@ -68,16 +66,16 @@ func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags
}
// UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD.
-func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(epfd)
+ epollfile := t.GetFile(epfd)
if epollfile == nil {
return syscall.EBADF
}
defer epollfile.DecRef()
// Get the target file id.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return syscall.EBADF
}
@@ -94,16 +92,16 @@ func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFl
}
// RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL.
-func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error {
+func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(epfd)
+ epollfile := t.GetFile(epfd)
if epollfile == nil {
return syscall.EBADF
}
defer epollfile.DecRef()
// Get the target file id.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return syscall.EBADF
}
@@ -120,9 +118,9 @@ func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error {
}
// WaitEpoll implements the epoll_wait(2) linux syscall.
-func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event, error) {
+func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) {
// Get epoll from the file descriptor.
- epollfile := t.FDMap().GetFile(fd)
+ epollfile := t.GetFile(fd)
if epollfile == nil {
return nil, syscall.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 6e5be0158..33a40b9c6 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -71,7 +71,6 @@ go_library(
"//pkg/sentry/kernel/epoll",
"//pkg/sentry/kernel/eventfd",
"//pkg/sentry/kernel/fasync",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/kernel/pipe",
"//pkg/sentry/kernel/sched",
"//pkg/sentry/kernel/shm",
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 94816b038..99af33cfd 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -50,244 +50,244 @@ var AMD64 = &kernel.SyscallTable{
Table: map[uintptr]kernel.Syscall{
0: syscalls.Supported("read", Read),
1: syscalls.Supported("write", Write),
- 2: syscalls.Supported("open", Open),
+ 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil),
3: syscalls.Supported("close", Close),
- 4: syscalls.Undocumented("stat", Stat),
- 5: syscalls.Undocumented("fstat", Fstat),
- 6: syscalls.Undocumented("lstat", Lstat),
- 7: syscalls.Undocumented("poll", Poll),
- 8: syscalls.Undocumented("lseek", Lseek),
- 9: syscalls.Undocumented("mmap", Mmap),
- 10: syscalls.Undocumented("mprotect", Mprotect),
- 11: syscalls.Undocumented("munmap", Munmap),
- 12: syscalls.Undocumented("brk", Brk),
- 13: syscalls.Undocumented("rt_sigaction", RtSigaction),
- 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask),
- 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn),
- 16: syscalls.Undocumented("ioctl", Ioctl),
- 17: syscalls.Undocumented("pread64", Pread64),
- 18: syscalls.Undocumented("pwrite64", Pwrite64),
- 19: syscalls.Undocumented("readv", Readv),
- 20: syscalls.Undocumented("writev", Writev),
- 21: syscalls.Undocumented("access", Access),
- 22: syscalls.Undocumented("pipe", Pipe),
- 23: syscalls.Undocumented("select", Select),
- 24: syscalls.Undocumented("sched_yield", SchedYield),
- 25: syscalls.Undocumented("mremap", Mremap),
- 26: syscalls.Undocumented("msync", Msync),
- 27: syscalls.Undocumented("mincore", Mincore),
- 28: syscalls.Undocumented("madvise", Madvise),
- 29: syscalls.Undocumented("shmget", Shmget),
- 30: syscalls.Undocumented("shmat", Shmat),
- 31: syscalls.Undocumented("shmctl", Shmctl),
- 32: syscalls.Undocumented("dup", Dup),
- 33: syscalls.Undocumented("dup2", Dup2),
- 34: syscalls.Undocumented("pause", Pause),
- 35: syscalls.Undocumented("nanosleep", Nanosleep),
- 36: syscalls.Undocumented("getitimer", Getitimer),
- 37: syscalls.Undocumented("alarm", Alarm),
- 38: syscalls.Undocumented("setitimer", Setitimer),
- 39: syscalls.Undocumented("getpid", Getpid),
- 40: syscalls.Undocumented("sendfile", Sendfile),
- 41: syscalls.Undocumented("socket", Socket),
- 42: syscalls.Undocumented("connect", Connect),
- 43: syscalls.Undocumented("accept", Accept),
- 44: syscalls.Undocumented("sendto", SendTo),
- 45: syscalls.Undocumented("recvfrom", RecvFrom),
- 46: syscalls.Undocumented("sendmsg", SendMsg),
- 47: syscalls.Undocumented("recvmsg", RecvMsg),
- 48: syscalls.Undocumented("shutdown", Shutdown),
- 49: syscalls.Undocumented("bind", Bind),
- 50: syscalls.Undocumented("listen", Listen),
- 51: syscalls.Undocumented("getsockname", GetSockName),
- 52: syscalls.Undocumented("getpeername", GetPeerName),
- 53: syscalls.Undocumented("socketpair", SocketPair),
- 54: syscalls.Undocumented("setsockopt", SetSockOpt),
- 55: syscalls.Undocumented("getsockopt", GetSockOpt),
- 56: syscalls.Undocumented("clone", Clone),
- 57: syscalls.Undocumented("fork", Fork),
- 58: syscalls.Undocumented("vfork", Vfork),
- 59: syscalls.Undocumented("execve", Execve),
- 60: syscalls.Undocumented("exit", Exit),
- 61: syscalls.Undocumented("wait4", Wait4),
- 62: syscalls.Undocumented("kill", Kill),
- 63: syscalls.Undocumented("uname", Uname),
- 64: syscalls.Undocumented("semget", Semget),
- 65: syscalls.Undocumented("semop", Semop),
- 66: syscalls.Undocumented("semctl", Semctl),
- 67: syscalls.Undocumented("shmdt", Shmdt),
+ 4: syscalls.Supported("stat", Stat),
+ 5: syscalls.Supported("fstat", Fstat),
+ 6: syscalls.Supported("lstat", Lstat),
+ 7: syscalls.Supported("poll", Poll),
+ 8: syscalls.Supported("lseek", Lseek),
+ 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+ 10: syscalls.Supported("mprotect", Mprotect),
+ 11: syscalls.Supported("munmap", Munmap),
+ 12: syscalls.Supported("brk", Brk),
+ 13: syscalls.Supported("rt_sigaction", RtSigaction),
+ 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+ 15: syscalls.Supported("rt_sigreturn", RtSigreturn),
+ 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+ 17: syscalls.Supported("pread64", Pread64),
+ 18: syscalls.Supported("pwrite64", Pwrite64),
+ 19: syscalls.Supported("readv", Readv),
+ 20: syscalls.Supported("writev", Writev),
+ 21: syscalls.Supported("access", Access),
+ 22: syscalls.Supported("pipe", Pipe),
+ 23: syscalls.Supported("select", Select),
+ 24: syscalls.Supported("sched_yield", SchedYield),
+ 25: syscalls.Supported("mremap", Mremap),
+ 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+ 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+ 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+ 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+ 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+ 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+ 32: syscalls.Supported("dup", Dup),
+ 33: syscalls.Supported("dup2", Dup2),
+ 34: syscalls.Supported("pause", Pause),
+ 35: syscalls.Supported("nanosleep", Nanosleep),
+ 36: syscalls.Supported("getitimer", Getitimer),
+ 37: syscalls.Supported("alarm", Alarm),
+ 38: syscalls.Supported("setitimer", Setitimer),
+ 39: syscalls.Supported("getpid", Getpid),
+ 40: syscalls.Supported("sendfile", Sendfile),
+ 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+ 42: syscalls.Supported("connect", Connect),
+ 43: syscalls.Supported("accept", Accept),
+ 44: syscalls.Supported("sendto", SendTo),
+ 45: syscalls.Supported("recvfrom", RecvFrom),
+ 46: syscalls.Supported("sendmsg", SendMsg),
+ 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+ 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+ 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+ 50: syscalls.Supported("listen", Listen),
+ 51: syscalls.Supported("getsockname", GetSockName),
+ 52: syscalls.Supported("getpeername", GetPeerName),
+ 53: syscalls.Supported("socketpair", SocketPair),
+ 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+ 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+ 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+ 57: syscalls.Supported("fork", Fork),
+ 58: syscalls.Supported("vfork", Vfork),
+ 59: syscalls.Supported("execve", Execve),
+ 60: syscalls.Supported("exit", Exit),
+ 61: syscalls.Supported("wait4", Wait4),
+ 62: syscalls.Supported("kill", Kill),
+ 63: syscalls.Supported("uname", Uname),
+ 64: syscalls.Supported("semget", Semget),
+ 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+ 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 67: syscalls.Supported("shmdt", Shmdt),
68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
- 72: syscalls.Undocumented("fcntl", Fcntl),
- 73: syscalls.Undocumented("flock", Flock),
- 74: syscalls.Undocumented("fsync", Fsync),
- 75: syscalls.Undocumented("fdatasync", Fdatasync),
- 76: syscalls.Undocumented("truncate", Truncate),
- 77: syscalls.Undocumented("ftruncate", Ftruncate),
- 78: syscalls.Undocumented("getdents", Getdents),
- 79: syscalls.Undocumented("getcwd", Getcwd),
- 80: syscalls.Undocumented("chdir", Chdir),
- 81: syscalls.Undocumented("fchdir", Fchdir),
- 82: syscalls.Undocumented("rename", Rename),
- 83: syscalls.Undocumented("mkdir", Mkdir),
- 84: syscalls.Undocumented("rmdir", Rmdir),
- 85: syscalls.Undocumented("creat", Creat),
- 86: syscalls.Undocumented("link", Link),
- 87: syscalls.Undocumented("link", Unlink),
- 88: syscalls.Undocumented("symlink", Symlink),
- 89: syscalls.Undocumented("readlink", Readlink),
- 90: syscalls.Undocumented("chmod", Chmod),
- 91: syscalls.Undocumented("fchmod", Fchmod),
- 92: syscalls.Undocumented("chown", Chown),
- 93: syscalls.Undocumented("fchown", Fchown),
- 94: syscalls.Undocumented("lchown", Lchown),
- 95: syscalls.Undocumented("umask", Umask),
- 96: syscalls.Undocumented("gettimeofday", Gettimeofday),
- 97: syscalls.Undocumented("getrlimit", Getrlimit),
- 98: syscalls.Undocumented("getrusage", Getrusage),
- 99: syscalls.Undocumented("sysinfo", Sysinfo),
- 100: syscalls.Undocumented("times", Times),
- 101: syscalls.Undocumented("ptrace", Ptrace),
- 102: syscalls.Undocumented("getuid", Getuid),
- 103: syscalls.Undocumented("syslog", Syslog),
- 104: syscalls.Undocumented("getgid", Getgid),
- 105: syscalls.Undocumented("setuid", Setuid),
- 106: syscalls.Undocumented("setgid", Setgid),
- 107: syscalls.Undocumented("geteuid", Geteuid),
- 108: syscalls.Undocumented("getegid", Getegid),
- 109: syscalls.Undocumented("setpgid", Setpgid),
- 110: syscalls.Undocumented("getppid", Getppid),
- 111: syscalls.Undocumented("getpgrp", Getpgrp),
- 112: syscalls.Undocumented("setsid", Setsid),
- 113: syscalls.Undocumented("setreuid", Setreuid),
- 114: syscalls.Undocumented("setregid", Setregid),
- 115: syscalls.Undocumented("getgroups", Getgroups),
- 116: syscalls.Undocumented("setgroups", Setgroups),
- 117: syscalls.Undocumented("setresuid", Setresuid),
- 118: syscalls.Undocumented("getresuid", Getresuid),
- 119: syscalls.Undocumented("setresgid", Setresgid),
- 120: syscalls.Undocumented("setresgid", Getresgid),
- 121: syscalls.Undocumented("getpgid", Getpgid),
+ 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+ 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+ 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+ 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+ 76: syscalls.Supported("truncate", Truncate),
+ 77: syscalls.Supported("ftruncate", Ftruncate),
+ 78: syscalls.Supported("getdents", Getdents),
+ 79: syscalls.Supported("getcwd", Getcwd),
+ 80: syscalls.Supported("chdir", Chdir),
+ 81: syscalls.Supported("fchdir", Fchdir),
+ 82: syscalls.Supported("rename", Rename),
+ 83: syscalls.Supported("mkdir", Mkdir),
+ 84: syscalls.Supported("rmdir", Rmdir),
+ 85: syscalls.Supported("creat", Creat),
+ 86: syscalls.Supported("link", Link),
+ 87: syscalls.Supported("unlink", Unlink),
+ 88: syscalls.Supported("symlink", Symlink),
+ 89: syscalls.Supported("readlink", Readlink),
+ 90: syscalls.Supported("chmod", Chmod),
+ 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+ 92: syscalls.Supported("chown", Chown),
+ 93: syscalls.Supported("fchown", Fchown),
+ 94: syscalls.Supported("lchown", Lchown),
+ 95: syscalls.Supported("umask", Umask),
+ 96: syscalls.Supported("gettimeofday", Gettimeofday),
+ 97: syscalls.Supported("getrlimit", Getrlimit),
+ 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+ 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+ 100: syscalls.Supported("times", Times),
+ 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+ 102: syscalls.Supported("getuid", Getuid),
+ 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+ 104: syscalls.Supported("getgid", Getgid),
+ 105: syscalls.Supported("setuid", Setuid),
+ 106: syscalls.Supported("setgid", Setgid),
+ 107: syscalls.Supported("geteuid", Geteuid),
+ 108: syscalls.Supported("getegid", Getegid),
+ 109: syscalls.Supported("setpgid", Setpgid),
+ 110: syscalls.Supported("getppid", Getppid),
+ 111: syscalls.Supported("getpgrp", Getpgrp),
+ 112: syscalls.Supported("setsid", Setsid),
+ 113: syscalls.Supported("setreuid", Setreuid),
+ 114: syscalls.Supported("setregid", Setregid),
+ 115: syscalls.Supported("getgroups", Getgroups),
+ 116: syscalls.Supported("setgroups", Setgroups),
+ 117: syscalls.Supported("setresuid", Setresuid),
+ 118: syscalls.Supported("getresuid", Getresuid),
+ 119: syscalls.Supported("setresgid", Setresgid),
+ 120: syscalls.Supported("setresgid", Getresgid),
+ 121: syscalls.Supported("getpgid", Getpgid),
122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
- 124: syscalls.Undocumented("getsid", Getsid),
- 125: syscalls.Undocumented("capget", Capget),
- 126: syscalls.Undocumented("capset", Capset),
- 127: syscalls.Undocumented("rt_sigpending", RtSigpending),
- 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait),
- 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo),
- 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend),
- 131: syscalls.Undocumented("sigaltstack", Sigaltstack),
- 132: syscalls.Undocumented("utime", Utime),
- 133: syscalls.Undocumented("mknod", Mknod),
+ 124: syscalls.Supported("getsid", Getsid),
+ 125: syscalls.Supported("capget", Capget),
+ 126: syscalls.Supported("capset", Capset),
+ 127: syscalls.Supported("rt_sigpending", RtSigpending),
+ 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+ 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+ 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+ 131: syscalls.Supported("sigaltstack", Sigaltstack),
+ 132: syscalls.Supported("utime", Utime),
+ 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil),
134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil),
135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil),
136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil),
- 137: syscalls.Undocumented("statfs", Statfs),
- 138: syscalls.Undocumented("fstatfs", Fstatfs),
+ 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+ 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
- 140: syscalls.Undocumented("getpriority", Getpriority),
- 141: syscalls.Undocumented("setpriority", Setpriority),
+ 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+ 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
- 143: syscalls.Undocumented("sched_getparam", SchedGetparam),
- 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler),
- 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler),
- 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax),
- 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin),
+ 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+ 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+ 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+ 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+ 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil),
- 149: syscalls.Undocumented("mlock", Mlock),
- 150: syscalls.Undocumented("munlock", Munlock),
- 151: syscalls.Undocumented("mlockall", Mlockall),
- 152: syscalls.Undocumented("munlockall", Munlockall),
+ 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+ 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil),
155: syscalls.Error("pivot_root", syscall.EPERM, "", nil),
- 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil),
- 157: syscalls.Undocumented("prctl", Prctl),
- 158: syscalls.Undocumented("arch_prctl", ArchPrctl),
+ 156: syscalls.Error("sysctl", syscall.EPERM, "Deprecated. Use /proc/sys instead.", nil),
+ 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+ 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil),
159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
- 160: syscalls.Undocumented("setrlimit", Setrlimit),
- 161: syscalls.Undocumented("chroot", Chroot),
- 162: syscalls.Undocumented("sync", Sync),
+ 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+ 161: syscalls.Supported("chroot", Chroot),
+ 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
- 165: syscalls.Undocumented("mount", Mount),
- 166: syscalls.Undocumented("umount2", Umount2),
+ 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+ 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
- 170: syscalls.Undocumented("sethostname", Sethostname),
- 171: syscalls.Undocumented("setdomainname", Setdomainname),
+ 170: syscalls.Supported("sethostname", Sethostname),
+ 171: syscalls.Supported("setdomainname", Setdomainname),
172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
- 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil),
- 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil),
+ 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil),
+ 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil),
179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
- 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil),
- 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
- 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
- 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil),
- 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil),
- 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil),
- 186: syscalls.Undocumented("gettid", Gettid),
+ 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Removed after Linux 3.1.", nil),
+ 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil),
+ 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil),
+ 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux.", nil),
+ 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux.", nil),
+ 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux.", nil),
+ 186: syscalls.Supported("gettid", Gettid),
187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341)
- 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
- 200: syscalls.Undocumented("tkill", Tkill),
- 201: syscalls.Undocumented("time", Time),
- 202: syscalls.Undocumented("futex", Futex),
- 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity),
- 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity),
+ 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil),
+ 200: syscalls.Supported("tkill", Tkill),
+ 201: syscalls.Supported("time", Time),
+ 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+ 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+ 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
- 206: syscalls.Undocumented("io_setup", IoSetup),
- 207: syscalls.Undocumented("io_destroy", IoDestroy),
- 208: syscalls.Undocumented("io_getevents", IoGetevents),
- 209: syscalls.Undocumented("io_submit", IoSubmit),
- 210: syscalls.Undocumented("io_cancel", IoCancel),
+ 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+ 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
- 213: syscalls.Undocumented("epoll_create", EpollCreate),
- 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil),
- 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil),
- 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil),
- 217: syscalls.Undocumented("getdents64", Getdents64),
- 218: syscalls.Undocumented("set_tid_address", SetTidAddress),
- 219: syscalls.Undocumented("restart_syscall", RestartSyscall),
+ 213: syscalls.Supported("epoll_create", EpollCreate),
+ 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated.", nil),
+ 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated.", nil),
+ 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since Linux 3.16.", nil),
+ 217: syscalls.Supported("getdents64", Getdents64),
+ 218: syscalls.Supported("set_tid_address", SetTidAddress),
+ 219: syscalls.Supported("restart_syscall", RestartSyscall),
220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
- 221: syscalls.Undocumented("fadvise64", Fadvise64),
- 222: syscalls.Undocumented("timer_create", TimerCreate),
- 223: syscalls.Undocumented("timer_settime", TimerSettime),
- 224: syscalls.Undocumented("timer_gettime", TimerGettime),
- 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun),
- 226: syscalls.Undocumented("timer_delete", TimerDelete),
- 227: syscalls.Undocumented("clock_settime", ClockSettime),
- 228: syscalls.Undocumented("clock_gettime", ClockGettime),
- 229: syscalls.Undocumented("clock_getres", ClockGetres),
- 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep),
- 231: syscalls.Undocumented("exit_group", ExitGroup),
- 232: syscalls.Undocumented("epoll_wait", EpollWait),
- 233: syscalls.Undocumented("epoll_ctl", EpollCtl),
- 234: syscalls.Undocumented("tgkill", Tgkill),
- 235: syscalls.Undocumented("utimes", Utimes),
+ 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+ 222: syscalls.Supported("timer_create", TimerCreate),
+ 223: syscalls.Supported("timer_settime", TimerSettime),
+ 224: syscalls.Supported("timer_gettime", TimerGettime),
+ 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+ 226: syscalls.Supported("timer_delete", TimerDelete),
+ 227: syscalls.Supported("clock_settime", ClockSettime),
+ 228: syscalls.Supported("clock_gettime", ClockGettime),
+ 229: syscalls.Supported("clock_getres", ClockGetres),
+ 230: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+ 231: syscalls.Supported("exit_group", ExitGroup),
+ 232: syscalls.Supported("epoll_wait", EpollWait),
+ 233: syscalls.Supported("epoll_ctl", EpollCtl),
+ 234: syscalls.Supported("tgkill", Tgkill),
+ 235: syscalls.Supported("utimes", Utimes),
236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil),
237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
- 238: syscalls.Undocumented("set_mempolicy", SetMempolicy),
- 239: syscalls.Undocumented("get_mempolicy", GetMempolicy),
+ 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+ 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
@@ -295,69 +295,69 @@ var AMD64 = &kernel.SyscallTable{
244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
- 247: syscalls.Undocumented("waitid", Waitid),
- 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil),
- 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil),
- 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil),
+ 247: syscalls.Supported("waitid", Waitid),
+ 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user.", nil),
+ 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user.", nil),
+ 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user.", nil),
251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
- 253: syscalls.Undocumented("inotify_init", InotifyInit),
- 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch),
- 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch),
+ 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
+ 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+ 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
- 257: syscalls.Undocumented("openat", Openat),
- 258: syscalls.Undocumented("mkdirat", Mkdirat),
- 259: syscalls.Undocumented("mknodat", Mknodat),
- 260: syscalls.Undocumented("fchownat", Fchownat),
- 261: syscalls.Undocumented("futimesat", Futimesat),
- 262: syscalls.Undocumented("fstatat", Fstatat),
- 263: syscalls.Undocumented("unlinkat", Unlinkat),
- 264: syscalls.Undocumented("renameat", Renameat),
- 265: syscalls.Undocumented("linkat", Linkat),
- 266: syscalls.Undocumented("symlinkat", Symlinkat),
- 267: syscalls.Undocumented("readlinkat", Readlinkat),
- 268: syscalls.Undocumented("fchmodat", Fchmodat),
- 269: syscalls.Undocumented("faccessat", Faccessat),
- 270: syscalls.Undocumented("pselect", Pselect),
- 271: syscalls.Undocumented("ppoll", Ppoll),
- 272: syscalls.Undocumented("unshare", Unshare),
- 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil),
- 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil),
- 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
- 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
- 277: syscalls.Undocumented("sync_file_range", SyncFileRange),
+ 257: syscalls.Supported("openat", Openat),
+ 258: syscalls.Supported("mkdirat", Mkdirat),
+ 259: syscalls.Supported("mknodat", Mknodat),
+ 260: syscalls.Supported("fchownat", Fchownat),
+ 261: syscalls.Supported("futimesat", Futimesat),
+ 262: syscalls.Supported("fstatat", Fstatat),
+ 263: syscalls.Supported("unlinkat", Unlinkat),
+ 264: syscalls.Supported("renameat", Renameat),
+ 265: syscalls.Supported("linkat", Linkat),
+ 266: syscalls.Supported("symlinkat", Symlinkat),
+ 267: syscalls.Supported("readlinkat", Readlinkat),
+ 268: syscalls.Supported("fchmodat", Fchmodat),
+ 269: syscalls.Supported("faccessat", Faccessat),
+ 270: syscalls.Supported("pselect", Pselect),
+ 271: syscalls.Supported("ppoll", Ppoll),
+ 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+ 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete.", nil),
+ 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete.", nil),
+ 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
- 280: syscalls.Undocumented("utimensat", Utimensat),
- 281: syscalls.Undocumented("epoll_pwait", EpollPwait),
+ 280: syscalls.Supported("utimensat", Utimensat),
+ 281: syscalls.Supported("epoll_pwait", EpollPwait),
282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
- 283: syscalls.Undocumented("timerfd_create", TimerfdCreate),
- 284: syscalls.Undocumented("eventfd", Eventfd),
- 285: syscalls.Undocumented("fallocate", Fallocate),
- 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime),
- 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime),
- 288: syscalls.Undocumented("accept4", Accept4),
+ 283: syscalls.Supported("timerfd_create", TimerfdCreate),
+ 284: syscalls.Supported("eventfd", Eventfd),
+ 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+ 286: syscalls.Supported("timerfd_settime", TimerfdSettime),
+ 287: syscalls.Supported("timerfd_gettime", TimerfdGettime),
+ 288: syscalls.Supported("accept4", Accept4),
289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
- 290: syscalls.Undocumented("eventfd2", Eventfd2),
- 291: syscalls.Undocumented("epoll_create1", EpollCreate1),
- 292: syscalls.Undocumented("dup3", Dup3),
- 293: syscalls.Undocumented("pipe2", Pipe2),
- 294: syscalls.Undocumented("inotify_init1", InotifyInit1),
- 295: syscalls.Undocumented("preadv", Preadv),
- 296: syscalls.Undocumented("pwritev", Pwritev),
- 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+ 290: syscalls.Supported("eventfd2", Eventfd2),
+ 291: syscalls.Supported("epoll_create1", EpollCreate1),
+ 292: syscalls.Supported("dup3", Dup3),
+ 293: syscalls.Supported("pipe2", Pipe2),
+ 294: syscalls.Supported("inotify_init1", InotifyInit1),
+ 295: syscalls.Supported("preadv", Preadv),
+ 296: syscalls.Supported("pwritev", Pwritev),
+ 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil),
- 299: syscalls.Undocumented("recvmmsg", RecvMMsg),
+ 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
- 302: syscalls.Undocumented("prlimit64", Prlimit64),
+ 302: syscalls.Supported("prlimit64", Prlimit64),
303: syscalls.Error("name_to_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
304: syscalls.Error("open_by_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
- 306: syscalls.Undocumented("syncfs", Syncfs),
- 307: syscalls.Undocumented("sendmmsg", SendMMsg),
+ 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+ 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
- 309: syscalls.Undocumented("getcpu", Getcpu),
+ 309: syscalls.Supported("getcpu", Getcpu),
310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
@@ -365,20 +365,20 @@ var AMD64 = &kernel.SyscallTable{
314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772)
- 317: syscalls.Undocumented("seccomp", Seccomp),
- 318: syscalls.Undocumented("getrandom", GetRandom),
- 319: syscalls.Undocumented("memfd_create", MemfdCreate),
+ 317: syscalls.Supported("seccomp", Seccomp),
+ 318: syscalls.Supported("getrandom", GetRandom),
+ 319: syscalls.Supported("memfd_create", MemfdCreate),
320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836)
323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897)
- 325: syscalls.Undocumented("mlock2", Mlock2),
+ 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
// Syscalls after 325 are "backports" from versions of Linux after 4.4.
326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil),
- 327: syscalls.Undocumented("preadv2", Preadv2),
- 328: syscalls.Undocumented("pwritev2", Pwritev2),
+ 327: syscalls.Supported("preadv2", Preadv2),
+ 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
332: syscalls.Supported("statx", Statx),
},
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index 7081d1a45..f56411bfe 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/eventfd"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -55,7 +54,7 @@ type ioCallback struct {
OpCode uint16
ReqPrio int16
- FD uint32
+ FD int32
Buf uint64
Bytes uint64
@@ -65,7 +64,7 @@ type ioCallback struct {
Flags uint32
// eventfd to signal if IOCB_FLAG_RESFD is set in flags.
- ResFD uint32
+ ResFD int32
}
// ioEvent describes an I/O result.
@@ -292,7 +291,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC
// submitCallback processes a single callback.
func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error {
- file := t.FDMap().GetFile(kdefs.FD(cb.FD))
+ file := t.GetFile(cb.FD)
if file == nil {
// File not found.
return syserror.EBADF
@@ -302,7 +301,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
// Was there an eventFD? Extract it.
var eventFile *fs.File
if cb.Flags&_IOCB_FLAG_RESFD != 0 {
- eventFile = t.FDMap().GetFile(kdefs.FD(cb.ResFD))
+ eventFile = t.GetFile(cb.ResFD)
if eventFile == nil {
// Bad FD.
return syserror.EBADF
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 14a61cfa5..a0dd4d4e3 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/syscalls"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
@@ -61,9 +60,9 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// EpollCtl implements the epoll_ctl(2) linux syscall.
func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- epfd := kdefs.FD(args[0].Int())
+ epfd := args[0].Int()
op := args[1].Int()
- fd := kdefs.FD(args[2].Int())
+ fd := args[2].Int()
eventAddr := args[3].Pointer()
// Capture the event state if needed.
@@ -132,7 +131,7 @@ func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error {
// EpollWait implements the epoll_wait(2) linux syscall.
func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- epfd := kdefs.FD(args[0].Int())
+ epfd := args[0].Int()
eventsAddr := args[1].Pointer()
maxEvents := int(args[2].Int())
timeout := int(args[3].Int())
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
index 7dbe84884..5cfc2c3c8 100644
--- a/pkg/sentry/syscalls/linux/sys_eventfd.go
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -47,10 +47,9 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
})
defer event.DecRef()
- fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, event, kernel.FDFlags{
CloseOnExec: flags&EFD_CLOEXEC != 0,
- },
- t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 3410af69c..eb6f5648f 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -34,7 +33,7 @@ import (
)
// fileOpAt performs an operation on the second last component in the path.
-func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
+func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
// Extract the last component.
dir, name := fs.SplitLast(path)
if dir == "/" {
@@ -60,7 +59,7 @@ func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dire
}
// fileOpOn performs an operation on the last entry of the path.
-func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
+func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
var (
d *fs.Dirent // The file.
wd *fs.Dirent // The working directory (if required.)
@@ -78,7 +77,7 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func
rel = wd
} else {
// Need to extract the given FD.
- f = t.FDMap().GetFile(dirFD)
+ f = t.GetFile(dirFD)
if f == nil {
return syserror.EBADF
}
@@ -131,7 +130,7 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string
return path, dirPath, nil
}
-func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) {
+func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return 0, err
@@ -184,8 +183,9 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u
defer file.DecRef()
// Success.
- fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
- newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
if err != nil {
return err
}
@@ -201,7 +201,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u
return fd, err // Use result in frame.
}
-func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -285,7 +285,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Mknodat implements the linux syscall mknodat(2).
func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
path := args[1].Pointer()
mode := linux.FileMode(args[2].ModeT())
// We don't need this argument until we support creation of device nodes.
@@ -294,7 +294,7 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
return 0, nil, mknodAt(t, dirFD, path, mode)
}
-func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
+func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return 0, err
@@ -326,6 +326,7 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
if err != nil {
break
}
+ defer found.DecRef()
// We found something (possibly a symlink). If the
// O_EXCL flag was passed, then we can immediately
@@ -346,17 +347,18 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
}
// Try to resolve the symlink directly to a Dirent.
- resolved, err := found.Inode.Getlink(t)
- if err == nil || err != fs.ErrResolveViaReadlink {
+ var resolved *fs.Dirent
+ resolved, err = found.Inode.Getlink(t)
+ if err == nil {
// No more resolution necessary.
- found.DecRef()
- found = resolved
+ defer resolved.DecRef()
break
+ } else if err != fs.ErrResolveViaReadlink {
+ return err
}
// Are we able to resolve further?
if remainingTraversals == 0 {
- found.DecRef()
return syscall.ELOOP
}
@@ -373,10 +375,10 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
if err != nil {
break
}
+ defer newParent.DecRef()
// Repeat the process with the parent and name of the
// symlink target.
- parent.DecRef()
parent = newParent
name = newName
}
@@ -384,9 +386,6 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
var newFile *fs.File
switch err {
case nil:
- // The file existed.
- defer found.DecRef()
-
// Like sys_open, check for a few things about the
// filesystem before trying to get a reference to the
// fs.File. The same constraints on Check apply.
@@ -429,8 +428,9 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
}
// Success.
- fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
- newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
if err != nil {
return err
}
@@ -464,7 +464,7 @@ func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Openat implements linux syscall openat(2).
func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
flags := uint(args[2].Uint())
if flags&linux.O_CREAT != 0 {
@@ -503,7 +503,7 @@ func (ac accessContext) Value(key interface{}) interface{} {
}
}
-func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error {
+func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error {
const rOK = 4
const wOK = 2
const xOK = 1
@@ -558,7 +558,7 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Faccessat implements linux syscall faccessat(2).
func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
mode := args[2].ModeT()
flags := args[3].Int()
@@ -568,10 +568,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
// Ioctl implements linux syscall ioctl(2).
func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
request := int(args[1].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -580,12 +580,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Shared flags between file and socket.
switch request {
case linux.FIONCLEX:
- t.FDMap().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: false,
})
return 0, nil, nil
case linux.FIOCLEX:
- t.FDMap().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: true,
})
return 0, nil, nil
@@ -729,9 +729,9 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Fchdir implements the linux syscall fchdir(2).
func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -753,10 +753,13 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Close implements linux syscall close(2).
func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file, ok := t.FDMap().Remove(fd)
- if !ok {
+ // Note that Remove provides a reference on the file that we may use to
+ // flush. It is still active until we drop the final reference below
+ // (and other reference-holding operations complete).
+ file := t.FDTable().Remove(fd)
+ if file == nil {
return 0, nil, syserror.EBADF
}
defer file.DecRef()
@@ -767,30 +770,30 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Dup implements linux syscall dup(2).
func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
defer file.DecRef()
- newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
if err != nil {
return 0, nil, syserror.EMFILE
}
- return uintptr(newfd), nil, nil
+ return uintptr(newFD), nil, nil
}
// Dup2 implements linux syscall dup2(2).
func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldfd := kdefs.FD(args[0].Int())
- newfd := kdefs.FD(args[1].Int())
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
// If oldfd is a valid file descriptor, and newfd has the same value as oldfd,
// then dup2() does nothing, and returns newfd.
if oldfd == newfd {
- oldFile := t.FDMap().GetFile(oldfd)
+ oldFile := t.GetFile(oldfd)
if oldFile == nil {
return 0, nil, syserror.EBADF
}
@@ -806,21 +809,21 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Dup3 implements linux syscall dup3(2).
func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldfd := kdefs.FD(args[0].Int())
- newfd := kdefs.FD(args[1].Int())
+ oldfd := args[0].Int()
+ newfd := args[1].Int()
flags := args[2].Uint()
if oldfd == newfd {
return 0, nil, syserror.EINVAL
}
- oldFile := t.FDMap().GetFile(oldfd)
+ oldFile := t.GetFile(oldfd)
if oldFile == nil {
return 0, nil, syserror.EBADF
}
defer oldFile.DecRef()
- err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits())
+ err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
if err != nil {
return 0, nil, err
}
@@ -863,10 +866,10 @@ func fSetOwn(t *kernel.Task, file *fs.File, who int32) {
// Fcntl implements linux syscall fcntl(2).
func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
cmd := args[1].Int()
- file, flags := t.FDMap().GetDescriptor(fd)
+ file, flags := t.FDTable().Get(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -874,9 +877,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
switch cmd {
case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
- from := kdefs.FD(args[2].Int())
- fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC}
- fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits())
+ from := args[2].Int()
+ fd, err := t.NewFDFrom(from, file, kernel.FDFlags{
+ CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+ })
if err != nil {
return 0, nil, err
}
@@ -885,7 +889,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- t.FDMap().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
case linux.F_GETFL:
@@ -946,8 +950,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
- // The lock uid is that of the Task's FDMap.
- lockUniqueID := lock.UniqueID(t.FDMap().ID())
+ // The lock uid is that of the Task's FDTable.
+ lockUniqueID := lock.UniqueID(t.FDTable().ID())
// These locks don't block; execute the non-blocking operation using the inode's lock
// context directly.
@@ -1037,7 +1041,7 @@ const (
// Fadvise64 implements linux syscall fadvise64(2).
// This implementation currently ignores the provided advice.
func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
length := args[2].Int64()
advice := args[3].Int()
@@ -1046,7 +1050,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, syserror.EINVAL
}
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1072,7 +1076,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, nil
}
-func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1117,14 +1121,14 @@ func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Mkdirat implements linux syscall mkdirat(2).
func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
mode := linux.FileMode(args[2].ModeT())
return 0, nil, mkdirAt(t, dirFD, addr, mode)
}
-func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1164,7 +1168,7 @@ func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr)
}
-func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error {
+func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error {
newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
if err != nil {
return err
@@ -1207,7 +1211,7 @@ func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// Symlinkat implements linux syscall symlinkat(2).
func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
oldAddr := args[0].Pointer()
- dirFD := kdefs.FD(args[1].Int())
+ dirFD := args[1].Int()
newAddr := args[2].Pointer()
return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
@@ -1249,7 +1253,7 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
// linkAt creates a hard link to the target specified by oldDirFD and oldAddr,
// specified by newDirFD and newAddr. If resolve is true, then the symlinks
// will be followed when evaluating the target.
-func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error {
+func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error {
oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
if err != nil {
return err
@@ -1263,7 +1267,7 @@ func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kd
}
if allowEmpty && oldPath == "" {
- target := t.FDMap().GetFile(oldDirFD)
+ target := t.GetFile(oldDirFD)
if target == nil {
return syserror.EBADF
}
@@ -1325,9 +1329,9 @@ func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Linkat implements linux syscall linkat(2).
func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldDirFD := kdefs.FD(args[0].Int())
+ oldDirFD := args[0].Int()
oldAddr := args[1].Pointer()
- newDirFD := kdefs.FD(args[2].Int())
+ newDirFD := args[2].Int()
newAddr := args[3].Pointer()
// man linkat(2):
@@ -1352,7 +1356,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
}
-func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
+func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return 0, err
@@ -1402,7 +1406,7 @@ func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Readlinkat implements linux syscall readlinkat(2).
func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
bufAddr := args[2].Pointer()
size := args[3].SizeT()
@@ -1411,7 +1415,7 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return n, nil, err
}
-func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1441,7 +1445,7 @@ func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Unlinkat implements linux syscall unlinkat(2).
func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
flags := args[2].Uint()
if flags&linux.AT_REMOVEDIR != 0 {
@@ -1502,10 +1506,10 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Ftruncate implements linux syscall ftruncate(2).
func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
length := args[1].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1620,7 +1624,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
return nil
}
-func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
+func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
path, _, err := copyInPath(t, addr, allowEmpty)
if err != nil {
return err
@@ -1628,7 +1632,7 @@ func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty
if path == "" {
// Annoying. What's wrong with fchown?
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
@@ -1662,11 +1666,11 @@ func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fchown implements linux syscall fchown(2).
func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
uid := auth.UID(args[1].Uint())
gid := auth.GID(args[2].Uint())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1677,7 +1681,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fchownat implements Linux syscall fchownat(2).
func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
addr := args[1].Pointer()
uid := auth.UID(args[2].Uint())
gid := auth.GID(args[3].Uint())
@@ -1707,7 +1711,7 @@ func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
return nil
}
-func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
return err
@@ -1728,10 +1732,10 @@ func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Fchmod implements linux syscall fchmod(2).
func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
mode := linux.FileMode(args[1].ModeT())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -1742,7 +1746,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fchmodat implements linux syscall fchmodat(2).
func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
mode := linux.FileMode(args[2].ModeT())
@@ -1758,7 +1762,7 @@ func defaultSetToSystemTimeSpec() fs.TimeSpec {
}
}
-func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
+func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
// Does the task own the file?
if !d.Inode.CheckOwnership(t) {
@@ -1791,7 +1795,7 @@ func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, r
// Linux returns EINVAL in this case. See utimes.c.
return syserror.EINVAL
}
- f := t.FDMap().GetFile(dirFD)
+ f := t.GetFile(dirFD)
if f == nil {
return syserror.EBADF
}
@@ -1859,7 +1863,7 @@ func timespecIsValid(ts linux.Timespec) bool {
// Utimensat implements linux syscall utimensat(2).
func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
pathnameAddr := args[1].Pointer()
timesAddr := args[2].Pointer()
flags := args[3].Int()
@@ -1894,7 +1898,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
// Futimesat implements linux syscall futimesat(2).
func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- dirFD := kdefs.FD(args[0].Int())
+ dirFD := args[0].Int()
pathnameAddr := args[1].Pointer()
timesAddr := args[2].Pointer()
@@ -1918,7 +1922,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
}
-func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error {
+func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
if err != nil {
return err
@@ -1966,21 +1970,21 @@ func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Renameat implements linux syscall renameat(2).
func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- oldDirFD := kdefs.FD(args[0].Int())
+ oldDirFD := args[0].Int()
oldPathAddr := args[1].Pointer()
- newDirFD := kdefs.FD(args[2].Int())
+ newDirFD := args[2].Int()
newPathAddr := args[3].Pointer()
return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
}
// Fallocate implements linux system call fallocate(2).
func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
mode := args[1].Int64()
offset := args[2].Int64()
length := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -2029,10 +2033,10 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
// Flock implements linux syscall flock(2).
func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
operation := args[1].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
// flock(2): EBADF fd is not an open file descriptor.
return 0, nil, syserror.EBADF
@@ -2139,8 +2143,9 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
defer dirent.DecRef()
defer file.DecRef()
- fdFlags := kernel.FDFlags{CloseOnExec: cloExec}
- newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+ newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
+ CloseOnExec: cloExec,
+ })
if err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index dea872672..7a2beda23 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -23,14 +23,13 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
)
// Getdents implements linux syscall getdents(2) for 64bit systems.
func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := int(args[2].Uint())
@@ -46,7 +45,7 @@ func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Getdents64 implements linux syscall getdents64(2).
func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := int(args[2].Uint())
@@ -62,8 +61,8 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// getdents implements the core of getdents(2)/getdents64(2).
// f is the syscall implementation dirent serialization function.
-func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) {
- dir := t.FDMap().GetFile(fd)
+func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) {
+ dir := t.GetFile(fd)
if dir == nil {
return 0, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
index 9cfa660fa..a4f36e01f 100644
--- a/pkg/sentry/syscalls/linux/sys_inotify.go
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
)
const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC)
@@ -44,9 +43,9 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t))
defer n.DecRef()
- fd, err := t.FDMap().NewFDFrom(0, n, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, n, kernel.FDFlags{
CloseOnExec: flags&linux.IN_CLOEXEC != 0,
- }, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
@@ -63,8 +62,8 @@ func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// fdToInotify resolves an fd to an inotify object. If successful, the file will
// have an extra ref and the caller is responsible for releasing the ref.
-func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) {
- file := t.FDMap().GetFile(fd)
+func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) {
+ file := t.GetFile(fd)
if file == nil {
// Invalid fd.
return nil, nil, syscall.EBADF
@@ -82,7 +81,7 @@ func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) {
// InotifyAddWatch implements the inotify_add_watch() syscall.
func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
mask := args[2].Uint()
@@ -114,7 +113,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
}
// Copy out to the return frame.
- fd = kdefs.FD(ino.AddWatch(dirent, mask))
+ fd = ino.AddWatch(dirent, mask)
return nil
})
@@ -123,7 +122,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
// InotifyRmWatch implements the inotify_rm_watch() syscall.
func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
wd := args[1].Int()
ino, file, err := fdToInotify(t, fd)
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index a3813b818..297e920c4 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -18,17 +18,16 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
)
// Lseek implements linux syscall lseek(2).
func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
offset := args[1].Int64()
whence := args[2].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index d831833bc..58a05b5bb 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -40,7 +39,7 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
prot := args[2].Int()
flags := args[3].Int()
- fd := kdefs.FD(args[4].Int())
+ fd := args[4].Int()
fixed := flags&linux.MAP_FIXED != 0
private := flags&linux.MAP_PRIVATE != 0
shared := flags&linux.MAP_SHARED != 0
@@ -80,7 +79,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if !anon {
// Convert the passed FD to a file reference.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 7c1bea43d..576022dd5 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -19,8 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
@@ -38,24 +38,17 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
w.SetFlags(linuxToFlags(flags).Settable())
defer w.DecRef()
- rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{
- CloseOnExec: flags&linux.O_CLOEXEC != 0},
- t.ThreadGroup().Limits())
+ fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{
+ CloseOnExec: flags&linux.O_CLOEXEC != 0,
+ })
if err != nil {
return 0, err
}
- wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{
- CloseOnExec: flags&linux.O_CLOEXEC != 0},
- t.ThreadGroup().Limits())
- if err != nil {
- t.FDMap().Remove(rfd)
- return 0, err
- }
-
- if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil {
- t.FDMap().Remove(rfd)
- t.FDMap().Remove(wfd)
+ if _, err := t.CopyOut(addr, fds); err != nil {
+ // The files are not closed in this case, the exact semantics
+ // of this error case are not well defined, but they could have
+ // already been observed by user space.
return 0, syscall.EFAULT
}
return 0, nil
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index ef6211218..7a13beac2 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -64,7 +63,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
return
}
- file := t.FDMap().GetFile(kdefs.FD(pfd.FD))
+ file := t.GetFile(pfd.FD)
if file == nil {
pfd.REvents = linux.POLLNVAL
return
@@ -265,7 +264,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
// immediately to ensure we don't leak. Note, another thread
// might be about to close fd. This is racy, but that's
// OK. Linux is racy in the same way.
- file := t.FDMap().GetFile(kdefs.FD(fd))
+ file := t.GetFile(fd)
if file == nil {
return 0, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 9d70881fd..5329023e6 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/mm"
)
@@ -122,9 +121,9 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
switch args[1].Int() {
case linux.PR_SET_MM_EXE_FILE:
- fd := kdefs.FD(args[2].Int())
+ fd := args[2].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index a1965f490..b2474e60d 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -39,11 +38,11 @@ const (
// they can do large reads all at once. Bug for bug. Same for other read
// calls below.
func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -75,12 +74,12 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Pread64 implements linux syscall pread64(2).
func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -122,11 +121,11 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// Readv implements linux syscall readv(2).
func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -152,12 +151,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Preadv implements linux syscall preadv(2).
func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -201,13 +200,13 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// splits the offset argument into a high/low value for compatibility with
// 32-bit architectures. The flags argument is the 5th argument.
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
flags := int(args[5].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index ccdb079bb..d1165a57a 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
@@ -200,9 +199,9 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
})
defer s.DecRef()
- fd, err := t.FDMap().NewFDFrom(0, s, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, s, kernel.FDFlags{
CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
- }, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
}
@@ -225,9 +224,6 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
fileFlags := fs.SettableFileFlags{
NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
}
- fdFlags := kernel.FDFlags{
- CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
- }
// Create the socket pair.
s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol)
@@ -240,20 +236,16 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
defer s2.DecRef()
// Create the FDs for the sockets.
- fd1, err := t.FDMap().NewFDFrom(0, s1, fdFlags, t.ThreadGroup().Limits())
- if err != nil {
- return 0, nil, err
- }
- fd2, err := t.FDMap().NewFDFrom(0, s2, fdFlags, t.ThreadGroup().Limits())
+ fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{
+ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+ })
if err != nil {
- t.FDMap().Remove(fd1)
return 0, nil, err
}
// Copy the file descriptors out.
- if _, err := t.CopyOut(socks, []int32{int32(fd1), int32(fd2)}); err != nil {
- t.FDMap().Remove(fd1)
- t.FDMap().Remove(fd2)
+ if _, err := t.CopyOut(socks, fds); err != nil {
+ // Note that we don't close files here; see pipe(2) also.
return 0, nil, err
}
@@ -262,12 +254,12 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// Connect implements the linux syscall connect(2).
func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Uint()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -291,14 +283,14 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// accept is the implementation of the accept syscall. It is called by accept
// and accept4 syscall handlers.
-func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
// Check that no unsupported flags are passed in.
if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
return 0, syscall.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, syscall.EBADF
}
@@ -331,7 +323,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr
// Accept4 implements the linux syscall accept4(2).
func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
flags := int(args[3].Int())
@@ -342,7 +334,7 @@ func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// Accept implements the linux syscall accept(2).
func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
@@ -352,12 +344,12 @@ func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Bind implements the linux syscall bind(2).
func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Uint()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -380,11 +372,11 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Listen implements the linux syscall listen(2).
func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
backlog := args[1].Int()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -409,11 +401,11 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Shutdown implements the linux syscall shutdown(2).
func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
how := args[1].Int()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -437,14 +429,14 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// GetSockOpt implements the linux syscall getsockopt(2).
func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
level := args[1].Int()
name := args[2].Int()
optValAddr := args[3].Pointer()
optLenAddr := args[4].Pointer()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -521,14 +513,14 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interfac
//
// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
level := args[1].Int()
name := args[2].Int()
optValAddr := args[3].Pointer()
optLen := args[4].Int()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -561,12 +553,12 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// GetSockName implements the linux syscall getsockname(2).
func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -589,12 +581,12 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// GetPeerName implements the linux syscall getpeername(2).
func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
addrlen := args[2].Pointer()
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -617,7 +609,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// RecvMsg implements the linux syscall recvmsg(2).
func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
flags := args[2].Int()
@@ -627,7 +619,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -663,7 +655,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// RecvMMsg implements the linux syscall recvmmsg(2).
func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
vlen := args[2].Uint()
flags := args[3].Int()
@@ -680,7 +672,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -842,7 +834,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
// recvFrom is the implementation of the recvfrom syscall. It is called by
// recvfrom and recv syscall handlers.
-func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
if int(bufLen) < 0 {
return 0, syscall.EINVAL
}
@@ -853,7 +845,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, syscall.EBADF
}
@@ -903,7 +895,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f
// RecvFrom implements the linux syscall recvfrom(2).
func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
bufPtr := args[1].Pointer()
bufLen := args[2].Uint64()
flags := args[3].Int()
@@ -916,7 +908,7 @@ func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// SendMsg implements the linux syscall sendmsg(2).
func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
flags := args[2].Int()
@@ -926,7 +918,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -953,7 +945,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// SendMMsg implements the linux syscall sendmmsg(2).
func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
msgPtr := args[1].Pointer()
vlen := args[2].Uint()
flags := args[3].Int()
@@ -964,7 +956,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syscall.EBADF
}
@@ -1079,14 +1071,14 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
// sendTo is the implementation of the sendto syscall. It is called by sendto
// and send syscall handlers.
-func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
bl := int(bufLen)
if bl < 0 {
return 0, syscall.EINVAL
}
// Get socket from the file descriptor.
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, syscall.EBADF
}
@@ -1135,7 +1127,7 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla
// SendTo implements the linux syscall sendto(2).
func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
bufPtr := args[1].Pointer()
bufLen := args[2].Uint64()
flags := args[3].Int()
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index b6517313f..a7c98efcb 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -19,7 +19,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -81,8 +80,8 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
// Sendfile implements linux system call sendfile(2).
func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- outFD := kdefs.FD(args[0].Int())
- inFD := kdefs.FD(args[1].Int())
+ outFD := args[0].Int()
+ inFD := args[1].Int()
offsetAddr := args[2].Pointer()
count := int64(args[3].SizeT())
@@ -92,13 +91,13 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
// Get files.
- outFile := t.FDMap().GetFile(outFD)
+ outFile := t.GetFile(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
defer outFile.DecRef()
- inFile := t.FDMap().GetFile(inFD)
+ inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
@@ -163,9 +162,9 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Splice implements splice(2).
func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- inFD := kdefs.FD(args[0].Int())
+ inFD := args[0].Int()
inOffset := args[1].Pointer()
- outFD := kdefs.FD(args[2].Int())
+ outFD := args[2].Int()
outOffset := args[3].Pointer()
count := int64(args[4].SizeT())
flags := args[5].Int()
@@ -182,13 +181,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0
// Get files.
- outFile := t.FDMap().GetFile(outFD)
+ outFile := t.GetFile(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
defer outFile.DecRef()
- inFile := t.FDMap().GetFile(inFD)
+ inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
@@ -251,8 +250,8 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Tee imlements tee(2).
func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- inFD := kdefs.FD(args[0].Int())
- outFD := kdefs.FD(args[1].Int())
+ inFD := args[0].Int()
+ outFD := args[1].Int()
count := int64(args[2].SizeT())
flags := args[3].Int()
@@ -265,13 +264,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0
// Get files.
- outFile := t.FDMap().GetFile(outFD)
+ outFile := t.GetFile(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
defer outFile.DecRef()
- inFile := t.FDMap().GetFile(inFD)
+ inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 9a5657254..5556bc276 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -42,7 +41,7 @@ func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Fstatat implements linux syscall newfstatat, i.e. fstatat(2).
func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
statAddr := args[2].Pointer()
flags := args[3].Int()
@@ -54,7 +53,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if path == "" {
// Annoying. What's wrong with fstat?
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -93,10 +92,10 @@ func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Fstat implements linux syscall fstat(2).
func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
statAddr := args[1].Pointer()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -178,7 +177,7 @@ func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs
// Statx implements linux syscall statx(2).
func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
pathAddr := args[1].Pointer()
flags := args[2].Int()
mask := args[3].Uint()
@@ -197,7 +196,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if path == "" {
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -285,10 +284,10 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fstatfs implements linux syscall fstatfs(2).
func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
statfsAddr := args[1].Pointer()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 37225735f..3e55235bd 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -19,7 +19,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -32,9 +31,9 @@ func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
// Syncfs implements linux system call syncfs(2).
func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -47,9 +46,9 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Fsync implements linux syscall fsync(2).
func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -63,9 +62,9 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
//
// At the moment, it just calls Fsync, which is a big hammer, but correct.
func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -79,6 +78,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
var err error
+ fd := args[0].Int()
offset := args[1].Int64()
nbytes := args[2].Int64()
uflags := args[3].Uint()
@@ -97,8 +97,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
nbytes = fs.FileMaxOffset
}
- fd := kdefs.FD(args[0].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
index ea6d44315..1ce5ce4c3 100644
--- a/pkg/sentry/syscalls/linux/sys_timerfd.go
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -49,9 +48,9 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
NonBlocking: flags&linux.TFD_NONBLOCK != 0,
})
- fd, err := t.FDMap().NewFDFrom(0, f, kernel.FDFlags{
+ fd, err := t.NewFDFrom(0, f, kernel.FDFlags{
CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
- }, t.ThreadGroup().Limits())
+ })
if err != nil {
return 0, nil, err
}
@@ -61,7 +60,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
// TimerfdSettime implements Linux syscall timerfd_settime(2).
func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
flags := args[1].Int()
newValAddr := args[2].Pointer()
oldValAddr := args[3].Pointer()
@@ -70,7 +69,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
return 0, nil, syserror.EINVAL
}
- f := t.FDMap().GetFile(fd)
+ f := t.GetFile(fd)
if f == nil {
return 0, nil, syserror.EBADF
}
@@ -101,10 +100,10 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
// TimerfdGettime implements Linux syscall timerfd_gettime(2).
func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
curValAddr := args[1].Pointer()
- f := t.FDMap().GetFile(fd)
+ f := t.GetFile(fd)
if f == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 3a5bf9ac4..5278c96a6 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -39,11 +38,11 @@ const (
// Write implements linux syscall write(2).
func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -75,12 +74,12 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Pwrite64 implements linux syscall pwrite64(2).
func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
size := args[2].SizeT()
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -122,11 +121,11 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Writev implements linux syscall writev(2).
func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -152,12 +151,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Pwritev implements linux syscall pwritev(2).
func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -202,7 +201,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// splits the offset argument into a high/low value for compatibility with
// 32-bit architectures. The flags argument is the 5th argument.
- fd := kdefs.FD(args[0].Int())
+ fd := args[0].Int()
addr := args[1].Pointer()
iovcnt := int(args[2].Int())
offset := args[3].Int64()
@@ -212,7 +211,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, syserror.EACCES
}
- file := t.FDMap().GetFile(fd)
+ file := t.GetFile(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go
index a5f3d8407..94d52d5d5 100644
--- a/pkg/sentry/syscalls/syscalls.go
+++ b/pkg/sentry/syscalls/syscalls.go
@@ -40,16 +40,7 @@ func Supported(name string, fn kernel.SyscallFn) kernel.Syscall {
Name: name,
Fn: fn,
SupportLevel: kernel.SupportFull,
- Note: "Full Support",
- }
-}
-
-// Undocumented returns a syscall that is undocumented.
-func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall {
- return kernel.Syscall{
- Name: name,
- Fn: fn,
- SupportLevel: kernel.SupportUndocumented,
+ Note: "Fully Supported.",
}
}
@@ -75,7 +66,7 @@ func Error(name string, err syscall.Errno, note string, urls []string) kernel.Sy
return 0, nil, err
},
SupportLevel: kernel.SupportUnimplemented,
- Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ Note: fmt.Sprintf("%sReturns %q.", note, err.Error()),
URLs: urls,
}
}
@@ -93,7 +84,7 @@ func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string)
return 0, nil, err
},
SupportLevel: kernel.SupportUnimplemented,
- Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ Note: fmt.Sprintf("%sReturns %q.", note, err.Error()),
URLs: urls,
}
}
@@ -115,7 +106,7 @@ func CapError(name string, c linux.Capability, note string, urls []string) kerne
return 0, nil, syserror.ENOSYS
},
SupportLevel: kernel.SupportUnimplemented,
- Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS),
+ Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS),
URLs: urls,
}
}
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
new file mode 100644
index 000000000..fc9abbb55
--- /dev/null
+++ b/pkg/tcpip/iptables/BUILD
@@ -0,0 +1,18 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+go_library(
+ name = "iptables",
+ srcs = [
+ "iptables.go",
+ "targets.go",
+ "types.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ ],
+)
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
new file mode 100644
index 000000000..f1e1d1fad
--- /dev/null
+++ b/pkg/tcpip/iptables/iptables.go
@@ -0,0 +1,81 @@
+// Copyright 2019 The gVisor authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package iptables supports packet filtering and manipulation via the iptables
+// tool.
+package iptables
+
+const (
+ tablenameNat = "nat"
+ tablenameMangle = "mangle"
+)
+
+// Chain names as defined by net/ipv4/netfilter/ip_tables.c.
+const (
+ chainNamePrerouting = "PREROUTING"
+ chainNameInput = "INPUT"
+ chainNameForward = "FORWARD"
+ chainNameOutput = "OUTPUT"
+ chainNamePostrouting = "POSTROUTING"
+)
+
+// DefaultTables returns a default set of tables. Each chain is set to accept
+// all packets.
+func DefaultTables() *IPTables {
+ return &IPTables{
+ Tables: map[string]Table{
+ tablenameNat: Table{
+ BuiltinChains: map[Hook]Chain{
+ Prerouting: unconditionalAcceptChain(chainNamePrerouting),
+ Input: unconditionalAcceptChain(chainNameInput),
+ Output: unconditionalAcceptChain(chainNameOutput),
+ Postrouting: unconditionalAcceptChain(chainNamePostrouting),
+ },
+ DefaultTargets: map[Hook]Target{
+ Prerouting: UnconditionalAcceptTarget{},
+ Input: UnconditionalAcceptTarget{},
+ Output: UnconditionalAcceptTarget{},
+ Postrouting: UnconditionalAcceptTarget{},
+ },
+ UserChains: map[string]Chain{},
+ },
+ tablenameMangle: Table{
+ BuiltinChains: map[Hook]Chain{
+ Prerouting: unconditionalAcceptChain(chainNamePrerouting),
+ Output: unconditionalAcceptChain(chainNameOutput),
+ },
+ DefaultTargets: map[Hook]Target{
+ Prerouting: UnconditionalAcceptTarget{},
+ Output: UnconditionalAcceptTarget{},
+ },
+ UserChains: map[string]Chain{},
+ },
+ },
+ Priorities: map[Hook][]string{
+ Prerouting: []string{tablenameMangle, tablenameNat},
+ Output: []string{tablenameMangle, tablenameNat},
+ },
+ }
+}
+
+func unconditionalAcceptChain(name string) Chain {
+ return Chain{
+ Name: name,
+ Rules: []Rule{
+ Rule{
+ Target: UnconditionalAcceptTarget{},
+ },
+ },
+ }
+}
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
new file mode 100644
index 000000000..19a7f77e3
--- /dev/null
+++ b/pkg/tcpip/iptables/targets.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains various Targets.
+
+package iptables
+
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+
+// UnconditionalAcceptTarget accepts all packets.
+type UnconditionalAcceptTarget struct{}
+
+// Action implements Target.Action.
+func (UnconditionalAcceptTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+ return Accept, ""
+}
+
+// UnconditionalDropTarget denies all packets.
+type UnconditionalDropTarget struct{}
+
+// Action implements Target.Action.
+func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+ return Drop, ""
+}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
new file mode 100644
index 000000000..600bd9a10
--- /dev/null
+++ b/pkg/tcpip/iptables/types.go
@@ -0,0 +1,183 @@
+// Copyright 2019 The gVisor authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// A Hook specifies one of the hooks built into the network stack.
+//
+// Userspace app Userspace app
+// ^ |
+// | v
+// [Input] [Output]
+// ^ |
+// | v
+// | routing
+// | |
+// | v
+// ----->[Prerouting]----->routing----->[Forward]---------[Postrouting]----->
+type Hook uint
+
+// These values correspond to values in include/uapi/linux/netfilter.h.
+const (
+ // Prerouting happens before a packet is routed to applications or to
+ // be forwarded.
+ Prerouting Hook = iota
+
+ // Input happens before a packet reaches an application.
+ Input
+
+ // Forward happens once it's decided that a packet should be forwarded
+ // to another host.
+ Forward
+
+ // Output happens after a packet is written by an application to be
+ // sent out.
+ Output
+
+ // Postrouting happens just before a packet goes out on the wire.
+ Postrouting
+
+ // The total number of hooks.
+ NumHooks
+)
+
+// A Verdict is returned by a rule's target to indicate how traversal of rules
+// should (or should not) continue.
+type Verdict int
+
+const (
+ // Accept indicates the packet should continue traversing netstack as
+ // normal.
+ Accept Verdict = iota
+
+ // Drop inicates the packet should be dropped, stopping traversing
+ // netstack.
+ Drop
+
+ // Stolen indicates the packet was co-opted by the target and should
+ // stop traversing netstack.
+ Stolen
+
+ // Queue indicates the packet should be queued for userspace processing.
+ Queue
+
+ // Repeat indicates the packet should re-traverse the chains for the
+ // current hook.
+ Repeat
+
+ // None indicates no verdict was reached.
+ None
+
+ // Jump indicates a jump to another chain.
+ Jump
+
+ // Continue indicates that traversal should continue at the next rule.
+ Continue
+
+ // Return indicates that traversal should return to the calling chain.
+ Return
+)
+
+// IPTables holds all the tables for a netstack.
+type IPTables struct {
+ // Tables maps table names to tables. User tables have arbitrary names.
+ Tables map[string]Table
+
+ // Priorities maps each hook to a list of table names. The order of the
+ // list is the order in which each table should be visited for that
+ // hook.
+ Priorities map[Hook][]string
+}
+
+// A Table defines a set of chains and hooks into the network stack. The
+// currently supported tables are:
+// * nat
+// * mangle
+type Table struct {
+ // BuiltinChains holds the un-deletable chains built into netstack. If
+ // a hook isn't present in the map, this table doesn't utilize that
+ // hook.
+ BuiltinChains map[Hook]Chain
+
+ // DefaultTargets holds a target for each hook that will be executed if
+ // chain traversal doesn't yield a verdict.
+ DefaultTargets map[Hook]Target
+
+ // UserChains holds user-defined chains for the keyed by name. Users
+ // can give their chains arbitrary names.
+ UserChains map[string]Chain
+
+ // Chains maps names to chains for both builtin and user-defined chains.
+ // Its entries point to Chains already either in BuiltinChains or
+ // UserChains, and its purpose is to make looking up tables by name
+ // fast.
+ Chains map[string]*Chain
+}
+
+// ValidHooks returns a bitmap of the builtin hooks for the given table.
+func (table *Table) ValidHooks() (uint32, *tcpip.Error) {
+ hooks := uint32(0)
+ for hook, _ := range table.BuiltinChains {
+ hooks |= 1 << hook
+ }
+ return hooks, nil
+}
+
+// A Chain defines a list of rules for packet processing. When a packet
+// traverses a chain, it is checked against each rule until either a rule
+// returns a verdict or the chain ends.
+//
+// By convention, builtin chains end with a rule that matches everything and
+// returns either Accept or Drop. User-defined chains end with Return. These
+// aren't strictly necessary here, but the iptables tool writes tables this way.
+type Chain struct {
+ // Name is the chain name.
+ Name string
+
+ // Rules is the list of rules to traverse.
+ Rules []Rule
+}
+
+// A Rule is a packet processing rule. It consists of two pieces. First it
+// contains zero or more matchers, each of which is a specification of which
+// packets this rule applies to. If there are no matchers in the rule, it
+// applies to any packet.
+type Rule struct {
+ // Matchers is the list of matchers for this rule.
+ Matchers []Matcher
+
+ // Target is the action to invoke if all the matchers match the packet.
+ Target Target
+}
+
+// A Matcher is the interface for matching packets.
+type Matcher interface {
+ // Match returns whether the packet matches and whether the packet
+ // should be "hotdropped", i.e. dropped immediately. This is usually
+ // used for suspicious packets.
+ Match(hook Hook, packet buffer.VectorisedView, interfaceName string) (matches bool, hotdrop bool)
+}
+
+// A Target is the interface for taking an action for a packet.
+type Target interface {
+ // Action takes an action on the packet and returns a verdict on how
+ // traversal should (or should not) continue. If the return value is
+ // Jump, it also returns the name of the chain to jump to.
+ Action(packet buffer.VectorisedView) (Verdict, string)
+}