From d58eb9ce828fd7c831f30e922e01f1d2b84e462c Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 31 May 2019 16:14:04 -0700 Subject: Add basic iptables structures to netstack. Change-Id: Ib589906175a59dae315405a28f2d7f525ff8877f --- pkg/tcpip/iptables/BUILD | 20 +++++ pkg/tcpip/iptables/iptables.go | 97 +++++++++++++++++++++ pkg/tcpip/iptables/targets.go | 35 ++++++++ pkg/tcpip/iptables/types.go | 190 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 342 insertions(+) create mode 100644 pkg/tcpip/iptables/BUILD create mode 100644 pkg/tcpip/iptables/iptables.go create mode 100644 pkg/tcpip/iptables/targets.go create mode 100644 pkg/tcpip/iptables/types.go diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD new file mode 100644 index 000000000..173f148da --- /dev/null +++ b/pkg/tcpip/iptables/BUILD @@ -0,0 +1,20 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +go_library( + name = "iptables", + srcs = [ + "iptables.go", + "targets.go", + "types.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/iptables", + visibility = ["//visibility:public"], + deps = [ + "//pkg/state", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + ], +) diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go new file mode 100644 index 000000000..ee1ed4666 --- /dev/null +++ b/pkg/tcpip/iptables/iptables.go @@ -0,0 +1,97 @@ +// Copyright 2019 The gVisor authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package iptables supports packet filtering and manipulation via the iptables +// tool. +package iptables + +const ( + tablenameNat = "nat" + tablenameMangle = "mangle" +) + +// Chain names as defined by net/ipv4/netfilter/ip_tables.c. +const ( + chainNamePrerouting = "PREROUTING" + chainNameInput = "INPUT" + chainNameForward = "FORWARD" + chainNameOutput = "OUTPUT" + chainNamePostrouting = "POSTROUTING" +) + +// DefaultTables returns a default set of tables. Each chain is set to accept +// all packets. +func DefaultTables() *IPTables { + tables := IPTables{ + Tables: map[string]*Table{ + tablenameNat: &Table{ + BuiltinChains: map[Hook]*Chain{ + Prerouting: unconditionalAcceptChain(chainNamePrerouting), + Input: unconditionalAcceptChain(chainNameInput), + Output: unconditionalAcceptChain(chainNameOutput), + Postrouting: unconditionalAcceptChain(chainNamePostrouting), + }, + DefaultTargets: map[Hook]Target{ + Prerouting: UnconditionalAcceptTarget{}, + Input: UnconditionalAcceptTarget{}, + Output: UnconditionalAcceptTarget{}, + Postrouting: UnconditionalAcceptTarget{}, + }, + UserChains: map[string]*Chain{}, + }, + tablenameMangle: &Table{ + BuiltinChains: map[Hook]*Chain{ + Prerouting: unconditionalAcceptChain(chainNamePrerouting), + Output: unconditionalAcceptChain(chainNameOutput), + }, + DefaultTargets: map[Hook]Target{ + Prerouting: UnconditionalAcceptTarget{}, + Output: UnconditionalAcceptTarget{}, + }, + UserChains: map[string]*Chain{}, + }, + }, + Priorities: map[Hook][]string{ + Prerouting: []string{tablenameMangle, tablenameNat}, + Output: []string{tablenameMangle, tablenameNat}, + }, + } + + // Initialize each table's Chains field. + tables.Tables[tablenameNat].Chains = map[string]*Chain{ + chainNamePrerouting: tables.Tables[tablenameNat].BuiltinChains[Prerouting], + chainNameInput: tables.Tables[tablenameNat].BuiltinChains[Input], + chainNameOutput: tables.Tables[tablenameNat].BuiltinChains[Output], + chainNamePostrouting: tables.Tables[tablenameNat].BuiltinChains[Postrouting], + } + tables.Tables[tablenameMangle].Chains = map[string]*Chain{ + chainNamePrerouting: tables.Tables[tablenameMangle].BuiltinChains[Prerouting], + chainNameInput: tables.Tables[tablenameMangle].BuiltinChains[Input], + chainNameOutput: tables.Tables[tablenameMangle].BuiltinChains[Output], + chainNamePostrouting: tables.Tables[tablenameMangle].BuiltinChains[Postrouting], + } + + return &tables +} + +func unconditionalAcceptChain(name string) *Chain { + return &Chain{ + Name: name, + Rules: []*Rule{ + &Rule{ + Target: UnconditionalAcceptTarget{}, + }, + }, + } +} diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go new file mode 100644 index 000000000..028978e3a --- /dev/null +++ b/pkg/tcpip/iptables/targets.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + +// This file contains various Targets. + +// UnconditionalAcceptTarget accepts all packets. +type UnconditionalAcceptTarget struct{} + +// Action implements Target.Action. +func (_ UnconditionalAcceptTarget) Action(packet buffer.VectorisedView) (Verdict, string) { + return Accept, "" +} + +// UnconditionalDropTarget denies all packets. +type UnconditionalDropTarget struct{} + +// Action implements Target.Action. +func (_ UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, string) { + return Drop, "" +} diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go new file mode 100644 index 000000000..65bfc7b1d --- /dev/null +++ b/pkg/tcpip/iptables/types.go @@ -0,0 +1,190 @@ +// Copyright 2019 The gVisor authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iptables + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" +) + +// Hook specifies one of the hooks built into the network stack. +// +// Userspace app Userspace app +// ^ | +// | v +// [Input] [Output] +// ^ | +// | v +// | routing +// | | +// | v +// ----->[Prerouting]----->routing----->[Forward]---------[Postrouting]-----> +type Hook uint + +// These values correspond to values in include/uapi/linux/netfilter.h. +const ( + // Prerouting happens before a packet is routed to applications or to + // be forwarded. + Prerouting Hook = iota + + // Input happens before a packet reaches an application. + Input + + // Forward happens once it's decided that a packet should be forwarded + // to another host. + Forward + + // Output happens after a packet is written by an application to be + // sent out. + Output + + // Postrouting happens just before a packet goes out on the wire. + Postrouting + + // The total number of hooks. + NumHooks +) + +// Verdict is returned by a rule's target to indicate how traversal of rules +// should (or should not) continue. +type Verdict int + +const ( + // Accept indicates the packet should continue traversing netstack as + // normal. + Accept Verdict = iota + + // Drop inicates the packet should be dropped, stopping traversing + // netstack. + Drop + + // Stolen indicates the packet was co-opted by the target and should + // stop traversing netstack. + Stolen + + // Queue indicates the packet should be queued for userspace processing. + Queue + + // Repeat indicates the packet should re-traverse the chains for the + // current hook. + Repeat + + // None indicates no verdict was reached. + None + + // Jump indicates a jump to another chain. + Jump + + // Continue indicates that traversal should continue at the next rule. + Continue + + // Return indicates that traversal should return to the calling chain. + Return +) + +// IPTables holds all the tables for a netstack. +type IPTables struct { + // mu protects the entire struct. + mu sync.RWMutex + + // Tables maps table names to tables. User tables have arbitrary names. + Tables map[string]*Table + + // Priorities maps each hook to a list of table names. The order of the + // list is the order in which each table should be visited for that + // hook. + Priorities map[Hook][]string +} + +// Table defines a set of chains and hooks into the network stack. The +// currently supported tables are: +// * nat +// * mangle +type Table struct { + // BuiltinChains holds the un-deletable chains built into netstack. If + // a hook isn't present in the map, this table doesn't utilize that + // hook. + BuiltinChains map[Hook]*Chain + + // DefaultTargets holds a target for each hook that will be executed if + // chain traversal doesn't yield a verdict. + DefaultTargets map[Hook]Target + + // UserChains holds user-defined chains for the keyed by name. Users + // can give their chains arbitrary names. + UserChains map[string]*Chain + + // Chains maps names to chains for both builtin and user-defined chains. + // Its entries point to Chains already either in BuiltinChains and + // UserChains, and its purpose is to make looking up tables by name + // fast. + Chains map[string]*Chain +} + +// ValidHooks returns a bitmap of the builtin hooks for the given table. +// +// Precondition: IPTables.mu must be locked for reading. +func (table *Table) ValidHooks() (uint32, *tcpip.Error) { + hooks := uint32(0) + for hook, _ := range table.BuiltinChains { + hooks |= 1 << hook + } + return hooks, nil +} + +// Chain defines a list of rules for packet processing. When a packet traverses +// a chain, it is checked against each rule until either a rule returns a +// verdict or the chain ends. +// +// By convention, builtin chains end with a rule that matches everything and +// returns either Accept or Drop. User-defined chains end with Return. These +// aren't strictly necessary here, but the iptables tool writes tables this way. +type Chain struct { + // Name is the chain name. + Name string + + // Rules is the list of rules to traverse. + Rules []*Rule +} + +// Rule is a packet processing rule. It consists of two pieces. First it +// contains zero or more matchers, each of which is a specification of which +// packets this rule applies to. If there are no matchers in the rule, it +// applies to any packet. +type Rule struct { + // Matchers is the list of matchers for this rule. + Matchers []Matcher + + // Target is the action to invoke if all the matchers match the packet. + Target Target +} + +// Matcher is the interface for matching packets. +type Matcher interface { + // Match returns whether the packet matches and whether the packet + // should be "hotdropped", i.e. dropped immediately. This is usually + // used for suspicious packets. + Match(hook Hook, packet buffer.VectorisedView, interfaceName string) (matches bool, hotdrop bool) +} + +// Target is the interface for taking an action for a packet. +type Target interface { + // Action takes an action on the packet and returns a verdict on how + // traversal should (or should not) continue. If the return value is + // Jump, it also returns the name of the chain to jump to. + Action(packet buffer.VectorisedView) (Verdict, string) +} -- cgit v1.2.3 From 8afbd974da2483d8f81e3abde5c9d689719263cb Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 7 Jun 2019 12:54:53 -0700 Subject: Address Ian's comments. Change-Id: I7445033b1970cbba3f2ed0682fe520dce02d8fad --- pkg/tcpip/iptables/iptables.go | 36 +++++++++++------------------------- pkg/tcpip/iptables/types.go | 12 ++++++------ 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go index ee1ed4666..bd54ef5a6 100644 --- a/pkg/tcpip/iptables/iptables.go +++ b/pkg/tcpip/iptables/iptables.go @@ -34,9 +34,9 @@ const ( // all packets. func DefaultTables() *IPTables { tables := IPTables{ - Tables: map[string]*Table{ - tablenameNat: &Table{ - BuiltinChains: map[Hook]*Chain{ + Tables: map[string]Table{ + tablenameNat: Table{ + BuiltinChains: map[Hook]Chain{ Prerouting: unconditionalAcceptChain(chainNamePrerouting), Input: unconditionalAcceptChain(chainNameInput), Output: unconditionalAcceptChain(chainNameOutput), @@ -48,10 +48,10 @@ func DefaultTables() *IPTables { Output: UnconditionalAcceptTarget{}, Postrouting: UnconditionalAcceptTarget{}, }, - UserChains: map[string]*Chain{}, + UserChains: map[string]Chain{}, }, - tablenameMangle: &Table{ - BuiltinChains: map[Hook]*Chain{ + tablenameMangle: Table{ + BuiltinChains: map[Hook]Chain{ Prerouting: unconditionalAcceptChain(chainNamePrerouting), Output: unconditionalAcceptChain(chainNameOutput), }, @@ -59,7 +59,7 @@ func DefaultTables() *IPTables { Prerouting: UnconditionalAcceptTarget{}, Output: UnconditionalAcceptTarget{}, }, - UserChains: map[string]*Chain{}, + UserChains: map[string]Chain{}, }, }, Priorities: map[Hook][]string{ @@ -68,28 +68,14 @@ func DefaultTables() *IPTables { }, } - // Initialize each table's Chains field. - tables.Tables[tablenameNat].Chains = map[string]*Chain{ - chainNamePrerouting: tables.Tables[tablenameNat].BuiltinChains[Prerouting], - chainNameInput: tables.Tables[tablenameNat].BuiltinChains[Input], - chainNameOutput: tables.Tables[tablenameNat].BuiltinChains[Output], - chainNamePostrouting: tables.Tables[tablenameNat].BuiltinChains[Postrouting], - } - tables.Tables[tablenameMangle].Chains = map[string]*Chain{ - chainNamePrerouting: tables.Tables[tablenameMangle].BuiltinChains[Prerouting], - chainNameInput: tables.Tables[tablenameMangle].BuiltinChains[Input], - chainNameOutput: tables.Tables[tablenameMangle].BuiltinChains[Output], - chainNamePostrouting: tables.Tables[tablenameMangle].BuiltinChains[Postrouting], - } - return &tables } -func unconditionalAcceptChain(name string) *Chain { - return &Chain{ +func unconditionalAcceptChain(name string) Chain { + return Chain{ Name: name, - Rules: []*Rule{ - &Rule{ + Rules: []Rule{ + Rule{ Target: UnconditionalAcceptTarget{}, }, }, diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go index 65bfc7b1d..cdfb6ba28 100644 --- a/pkg/tcpip/iptables/types.go +++ b/pkg/tcpip/iptables/types.go @@ -98,11 +98,11 @@ const ( // IPTables holds all the tables for a netstack. type IPTables struct { - // mu protects the entire struct. - mu sync.RWMutex + // Mu protects the entire struct. + Mu sync.RWMutex // Tables maps table names to tables. User tables have arbitrary names. - Tables map[string]*Table + Tables map[string]Table // Priorities maps each hook to a list of table names. The order of the // list is the order in which each table should be visited for that @@ -118,7 +118,7 @@ type Table struct { // BuiltinChains holds the un-deletable chains built into netstack. If // a hook isn't present in the map, this table doesn't utilize that // hook. - BuiltinChains map[Hook]*Chain + BuiltinChains map[Hook]Chain // DefaultTargets holds a target for each hook that will be executed if // chain traversal doesn't yield a verdict. @@ -126,7 +126,7 @@ type Table struct { // UserChains holds user-defined chains for the keyed by name. Users // can give their chains arbitrary names. - UserChains map[string]*Chain + UserChains map[string]Chain // Chains maps names to chains for both builtin and user-defined chains. // Its entries point to Chains already either in BuiltinChains and @@ -158,7 +158,7 @@ type Chain struct { Name string // Rules is the list of rules to traverse. - Rules []*Rule + Rules []Rule } // Rule is a packet processing rule. It consists of two pieces. First it -- cgit v1.2.3 From 06a83df533244dc2b3b8adfc1bf0608d3753c1d9 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 10 Jun 2019 12:43:54 -0700 Subject: Address more comments. Change-Id: I83ae1079f3dcba6b018f59ab7898decab5c211d2 --- pkg/tcpip/iptables/iptables.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go index bd54ef5a6..f1e1d1fad 100644 --- a/pkg/tcpip/iptables/iptables.go +++ b/pkg/tcpip/iptables/iptables.go @@ -33,7 +33,7 @@ const ( // DefaultTables returns a default set of tables. Each chain is set to accept // all packets. func DefaultTables() *IPTables { - tables := IPTables{ + return &IPTables{ Tables: map[string]Table{ tablenameNat: Table{ BuiltinChains: map[Hook]Chain{ @@ -67,8 +67,6 @@ func DefaultTables() *IPTables { Output: []string{tablenameMangle, tablenameNat}, }, } - - return &tables } func unconditionalAcceptChain(name string) Chain { -- cgit v1.2.3 From b915a25597a961311ceb57f89d18eaee9c9461d8 Mon Sep 17 00:00:00 2001 From: Josh Gao Date: Thu, 13 Jun 2019 14:26:26 -0700 Subject: Fix use of "2 ^ 30". 2 ^ 30 is 28, not 1073741824. --- runsc/boot/loader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c1dea736f..b2a91bec9 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -259,7 +259,7 @@ func New(args Args) (*Loader, error) { // Adjust the total memory returned by the Sentry so that applications that // use /proc/meminfo can make allocations based on this limit. usage.MinimumTotalMemoryBytes = args.TotalMem - log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30)) + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1 << 30)) } // Initiate the Kernel object, which is required by the Context passed -- cgit v1.2.3 From 8a625ceeb173307094e81d273458b6651e54220a Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 28 Jun 2019 11:48:27 -0700 Subject: runsc: allow openat for runsc-race I see that runsc-race is killed by SIGSYS, because openat isn't allowed by seccomp filters: 60052 openat(AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY|O_CLOEXEC 60052 <... openat resumed> ) = 257 60052 --- SIGSYS {si_signo=SIGSYS, si_code=SYS_SECCOMP, si_call_addr=0xfaacf1, si_syscall=__NR_openat, si_arch=AUDIT_ARCH_X86_64} --- PiperOrigin-RevId: 255640808 --- runsc/boot/filter/extra_filters_race.go | 1 + 1 file changed, 1 insertion(+) diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index d5bee4453..9ff80276a 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -33,6 +33,7 @@ func instrumentationFilters() seccomp.SyscallRules { syscall.SYS_MUNLOCK: {}, syscall.SYS_NANOSLEEP: {}, syscall.SYS_OPEN: {}, + syscall.SYS_OPENAT: {}, syscall.SYS_SET_ROBUST_LIST: {}, // Used within glibc's malloc. syscall.SYS_TIME: {}, -- cgit v1.2.3 From 7c13789818ec43644c3a159cd5cad2a5aad2e26d Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Fri, 28 Jun 2019 12:06:17 -0700 Subject: Superblock interface in the disk layout package for ext4. PiperOrigin-RevId: 255644277 --- pkg/sentry/fs/ext4/disklayout/BUILD | 1 + pkg/sentry/fs/ext4/disklayout/block_group.go | 6 +- pkg/sentry/fs/ext4/disklayout/superblock.go | 468 +++++++++++++++++++++++++++ 3 files changed, 472 insertions(+), 3 deletions(-) create mode 100644 pkg/sentry/fs/ext4/disklayout/superblock.go diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD index 2fb1d5b37..be637bcbb 100644 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ b/pkg/sentry/fs/ext4/disklayout/BUILD @@ -8,6 +8,7 @@ go_library( "block_group.go", "block_group_32.go", "block_group_64.go", + "superblock.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout", ) diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext4/disklayout/block_group.go index 7359df5b9..7df76a036 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group.go @@ -128,8 +128,8 @@ func (f BGFlags) ToInt() uint16 { // BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct. func BGFlagsFromInt(flags uint16) BGFlags { return BGFlags{ - InodeUninit: (flags & BgInodeUninit) > 0, - BlockUninit: (flags & BgBlockUninit) > 0, - InodeZeroed: (flags & BgInodeZeroed) > 0, + InodeUninit: flags&BgInodeUninit > 0, + BlockUninit: flags&BgBlockUninit > 0, + InodeZeroed: flags&BgInodeZeroed > 0, } } diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext4/disklayout/superblock.go new file mode 100644 index 000000000..d630ba8a6 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/superblock.go @@ -0,0 +1,468 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock should be implemented by structs representing ext4 superblock. +// The superblock holds a lot of information about the enclosing filesystem. +// This interface aims to provide access methods to important information held +// by the superblock. It does NOT expose all fields of the superblock, only the +// ones necessary. This can be expanded when need be. +// +// Location and replication: +// - The superblock is located at offset 1024 in block group 0. +// - Redundant copies of the superblock and group descriptors are kept in +// all groups if sparse_super feature flag is NOT set. If it is set, the +// replicas only exist in groups whose group number is either 0 or a +// power of 3, 5, or 7. +// - There is also a sparse superblock feature v2 in which there are just +// two replicas saved in block groups pointed by the s_backup_bgs field. +// +// Replicas should eventually be updated if the superblock is updated. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. +type SuperBlock interface { + // InodesCount returns the total number of inodes in this filesystem. + InodesCount() uint32 + + // BlocksCount returns the total number of data blocks in this filesystem. + BlocksCount() uint64 + + // FreeBlocksCount returns the number of free blocks in this filesystem. + FreeBlocksCount() uint64 + + // FreeInodesCount returns the number of free inodes in this filesystem. + FreeInodesCount() uint32 + + // MountCount returns the number of mounts since the last fsck. + MountCount() uint16 + + // MaxMountCount returns the number of mounts allowed beyond which a fsck is + // needed. + MaxMountCount() uint16 + + // FirstDataBlock returns the absolute block number of the first data block, + // which contains the super block itself. + // + // If the filesystem has 1kb data blocks then this should return 1. For all + // other configurations, this typically returns 0. + // + // The first block group descriptor is in (FirstDataBlock() + 1)th block. + FirstDataBlock() uint32 + + // BlockSize returns the size of one data block in this filesystem. + // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that + // the smallest block size is 1kb. + BlockSize() uint64 + + // BlocksPerGroup returns the number of data blocks in a block group. + BlocksPerGroup() uint32 + + // ClusterSize returns block cluster size (set during mkfs time by admin). + // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that + // the smallest cluster size is 1kb. + // + // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature + // is NOT set and consequently BlockSize() = ClusterSize() in that case. + ClusterSize() uint64 + + // ClustersPerGroup returns: + // - number of clusters per group if bigalloc is enabled. + // - BlocksPerGroup() otherwise. + ClustersPerGroup() uint32 + + // InodeSize returns the size of the inode disk record size in bytes. Use this + // to iterate over inode arrays on disk. + // + // In ext2 and ext3: + // - Each inode had a disk record of 128 bytes. + // - The inode struct size was fixed at 128 bytes. + // + // In ext4 its possible to allocate larger on-disk inodes: + // - Inode disk record size = sb.s_inode_size (function return value). + // = 256 (default) + // - Inode struct size = 128 + inode.i_extra_isize. + // = 128 + 28 = 156 (default) + InodeSize() uint16 + + // InodesPerGroup returns the number of inodes in a block group. + InodesPerGroup() uint32 + + // BgDescSize returns the size of the block group descriptor struct. + // + // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor + // is only 32 bytes long. + // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST + // 64 bytes. It might be bigger than that. + BgDescSize() uint16 + + // CompatibleFeatures returns the CompatFeatures struct which holds all the + // compatible features this fs supports. + CompatibleFeatures() CompatFeatures + + // IncompatibleFeatures returns the CompatFeatures struct which holds all the + // incompatible features this fs supports. + IncompatibleFeatures() IncompatFeatures + + // ReadOnlyCompatibleFeatures returns the CompatFeatures struct which holds all the + // readonly compatible features this fs supports. + ReadOnlyCompatibleFeatures() RoCompatFeatures + + // Magic() returns the magic signature which must be 0xef53. + Magic() uint16 + + // Revision returns the superblock revision. Superblock struct fields from + // offset 0x54 till 0x150 should only be used if superblock has DynamicRev. + Revision() SbRevision +} + +// SbRevision is the type for superblock revisions. +type SbRevision int + +// Super block revisions. +const ( + // OldRev is the good old (original) format. + OldRev SbRevision = 0 + + // DynamicRev is v2 format w/ dynamic inode sizes. + DynamicRev SbRevision = 1 +) + +// Superblock compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirPrealloc indicates directory preallocation. + SbDirPrealloc = 0x1 + + // SbHasJournal indicates the presence of a journal. jbd2 should only work + // with this being set. + SbHasJournal = 0x4 + + // SbExtAttr indicates extended attributes support. + SbExtAttr = 0x8 + + // SbResizeInode indicates that the fs has reserved GDT blocks (right after + // group descriptors) for fs expansion. + SbResizeInode = 0x10 + + // SbDirIndex indicates that the fs has directory indices. + SbDirIndex = 0x20 + + // SbSparseV2 stands for Sparse superblock version 2. + SbSparseV2 = 0x200 +) + +// CompatFeatures represents a superblock's compatible feature set. If the +// kernel does not understand any of these feature, it can still read/write +// to this fs. +type CompatFeatures struct { + DirPrealloc bool + HasJournal bool + ExtAttr bool + ResizeInode bool + DirIndex bool + SparseV2 bool +} + +// ToInt converts superblock compatible features back to its 32-bit rep. +func (f CompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirPrealloc { + res |= SbDirPrealloc + } + if f.HasJournal { + res |= SbHasJournal + } + if f.ExtAttr { + res |= SbExtAttr + } + if f.ResizeInode { + res |= SbResizeInode + } + if f.DirIndex { + res |= SbDirIndex + } + if f.SparseV2 { + res |= SbSparseV2 + } + + return res +} + +// CompatFeaturesFromInt converts the integer representation of superblock +// compatible features to CompatFeatures struct. +func CompatFeaturesFromInt(f uint32) CompatFeatures { + return CompatFeatures{ + DirPrealloc: f&SbDirPrealloc > 0, + HasJournal: f&SbHasJournal > 0, + ExtAttr: f&SbExtAttr > 0, + ResizeInode: f&SbResizeInode > 0, + DirIndex: f&SbDirIndex > 0, + SparseV2: f&SbSparseV2 > 0, + } +} + +// Superblock incompatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirentFileType indicates that directory entries record the file type. + // We should use struct ext4_dir_entry_2 for dirents then. + SbDirentFileType = 0x2 + + // SbRecovery indicates that the filesystem needs recovery. + SbRecovery = 0x4 + + // SbJournalDev indicates that the filesystem has a separate journal device. + SbJournalDev = 0x8 + + // SbMetaBG indicates that the filesystem is using Meta block groups. Moves + // the group descriptors from the congested first block group into the first + // group of each metablock group to increase the maximum block groups limit + // and hence support much larger filesystems. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups. + SbMetaBG = 0x10 + + // SbExtents indicates that the filesystem uses extents. Must be set in ext4 + // filesystems. + SbExtents = 0x40 + + // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits. + // Hence can support 2^64 data blocks. + SbIs64Bit = 0x80 + + // SbMMP indicates that this filesystem has multiple mount protection. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection. + SbMMP = 0x100 + + // SbFlexBg indicates that this filesystem has flexible block groups. Several + // block groups are tied into one logical block group so that all the metadata + // for the block groups (bitmaps and inode tables) are close together for + // faster loading. Consequently, large files will be continuous on disk. + // However, this does not affect the placement of redundant superblocks and + // group descriptors. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups. + SbFlexBg = 0x200 + + // SbLargeDir shows that large directory enabled. Directory htree can be 3 + // levels deep. Directory htrees are allowed to be 2 levels deep otherwise. + SbLargeDir = 0x4000 + + // SbInlineData allows inline data in inodes for really small files. + SbInlineData = 0x8000 + + // SbEncrypted indicates that this fs contains encrypted inodes. + SbEncrypted = 0x10000 +) + +// IncompatFeatures represents a superblock's incompatible feature set. If the +// kernel does not understand any of these feature, it should refuse to mount. +type IncompatFeatures struct { + DirentFileType bool + Recovery bool + JournalDev bool + MetaBG bool + Extents bool + Is64Bit bool + MMP bool + FlexBg bool + LargeDir bool + InlineData bool + Encrypted bool +} + +// ToInt converts superblock incompatible features back to its 32-bit rep. +func (f IncompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirentFileType { + res |= SbDirentFileType + } + if f.Recovery { + res |= SbRecovery + } + if f.JournalDev { + res |= SbJournalDev + } + if f.MetaBG { + res |= SbMetaBG + } + if f.Extents { + res |= SbExtents + } + if f.Is64Bit { + res |= SbIs64Bit + } + if f.MMP { + res |= SbMMP + } + if f.FlexBg { + res |= SbFlexBg + } + if f.LargeDir { + res |= SbLargeDir + } + if f.InlineData { + res |= SbInlineData + } + if f.Encrypted { + res |= SbEncrypted + } + + return res +} + +// IncompatFeaturesFromInt converts the integer representation of superblock +// incompatible features to IncompatFeatures struct. +func IncompatFeaturesFromInt(f uint32) IncompatFeatures { + return IncompatFeatures{ + DirentFileType: f&SbDirentFileType > 0, + Recovery: f&SbRecovery > 0, + JournalDev: f&SbJournalDev > 0, + MetaBG: f&SbMetaBG > 0, + Extents: f&SbExtents > 0, + Is64Bit: f&SbIs64Bit > 0, + MMP: f&SbMMP > 0, + FlexBg: f&SbFlexBg > 0, + LargeDir: f&SbLargeDir > 0, + InlineData: f&SbInlineData > 0, + Encrypted: f&SbEncrypted > 0, + } +} + +// Superblock readonly compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbSparse indicates sparse superblocks. Only groups with number either 0 or + // a power of 3, 5, or 7 will have redundant copies of the superblock and + // block descriptors. + SbSparse = 0x1 + + // SbLargeFile indicates that this fs has been used to store a file >= 2GiB. + SbLargeFile = 0x2 + + // SbHugeFile indicates that this fs contains files whose sizes are + // represented in units of logicals blocks, not 512-byte sectors. + SbHugeFile = 0x8 + + // SbGdtCsum indicates that group descriptors have checksums. + SbGdtCsum = 0x10 + + // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a + // 32,000 subdirectory limit. + SbDirNlink = 0x20 + + // SbExtraIsize indicates that large inodes exist on this filesystem. + SbExtraIsize = 0x40 + + // SbHasSnapshot indicates the existence of a snapshot. + SbHasSnapshot = 0x80 + + // SbQuota enables usage tracking for all quota types. + SbQuota = 0x100 + + // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation + // unit becomes a cluster rather than a data block. Then block bitmaps track + // clusters, not data blocks. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc. + SbBigalloc = 0x200 + + // SbMetadataCsum indicates that the fs supports metadata checksumming. + SbMetadataCsum = 0x400 + + // SbReadOnly marks this filesystem as readonly. Should refuse to mount in + // read/write mode. + SbReadOnly = 0x1000 +) + +// RoCompatFeatures represents a superblock's readonly compatible feature set. +// If the kernel does not understand any of these feature, it can still mount +// readonly. But if the user wants to mount read/write, the kernel should +// refuse to mount. +type RoCompatFeatures struct { + Sparse bool + LargeFile bool + HugeFile bool + GdtCsum bool + DirNlink bool + ExtraIsize bool + HasSnapshot bool + Quota bool + Bigalloc bool + MetadataCsum bool + ReadOnly bool +} + +// ToInt converts superblock readonly compatible features to its 32-bit rep. +func (f RoCompatFeatures) ToInt() uint32 { + var res uint32 + + if f.Sparse { + res |= SbSparse + } + if f.LargeFile { + res |= SbLargeFile + } + if f.HugeFile { + res |= SbHugeFile + } + if f.GdtCsum { + res |= SbGdtCsum + } + if f.DirNlink { + res |= SbDirNlink + } + if f.ExtraIsize { + res |= SbExtraIsize + } + if f.HasSnapshot { + res |= SbHasSnapshot + } + if f.Quota { + res |= SbQuota + } + if f.Bigalloc { + res |= SbBigalloc + } + if f.MetadataCsum { + res |= SbMetadataCsum + } + if f.ReadOnly { + res |= SbReadOnly + } + + return res +} + +// RoCompatFeaturesFromInt converts the integer representation of superblock +// readonly compatible features to RoCompatFeatures struct. +func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures { + return RoCompatFeatures{ + Sparse: f&SbSparse > 0, + LargeFile: f&SbLargeFile > 0, + HugeFile: f&SbHugeFile > 0, + GdtCsum: f&SbGdtCsum > 0, + DirNlink: f&SbDirNlink > 0, + ExtraIsize: f&SbExtraIsize > 0, + HasSnapshot: f&SbHasSnapshot > 0, + Quota: f&SbQuota > 0, + Bigalloc: f&SbBigalloc > 0, + MetadataCsum: f&SbMetadataCsum > 0, + ReadOnly: f&SbReadOnly > 0, + } +} -- cgit v1.2.3 From e21d49c2d82b7d109bc7c8955edea1787de59ac0 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 28 Jun 2019 13:22:28 -0700 Subject: platform/ptrace: return more detailed errors Right now, if we can't create a stub process, we will see this error: panic: unable to activate mm: resource temporarily unavailable It would be better to know the root cause of this "resource temporarily unavailable". PiperOrigin-RevId: 255656831 --- pkg/sentry/platform/ptrace/subprocess_linux.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index dc3475494..87ded0bbd 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -306,7 +306,7 @@ func (s *subprocess) createStub() (*thread, error) { arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { - return nil, err + return nil, fmt.Errorf("creating stub process: %v", err) } // Wait for child to enter group-stop, so we don't stop its @@ -325,7 +325,7 @@ func (s *subprocess) createStub() (*thread, error) { arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { - return nil, err + return nil, fmt.Errorf("waiting on stub process: %v", err) } childT := &thread{ -- cgit v1.2.3 From 295078fa7ae7893f3deb0e6042e2aecfaba0dd1c Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 28 Jun 2019 15:27:25 -0700 Subject: Automated rollback of changelist 255263686 PiperOrigin-RevId: 255679453 --- pkg/sentry/fs/inode_overlay.go | 6 ++++++ runsc/boot/fs.go | 13 ------------- test/syscalls/BUILD | 2 +- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index b247fa514..24b769cfc 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -567,6 +567,12 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { if o.upper != nil { err = o.upper.check(ctx, p) } else { + if p.Write { + // Since writes will be redirected to the upper filesystem, the lower + // filesystem need not be writable, but must be readable for copy-up. + p.Write = false + p.Read = true + } err = o.lower.check(ctx, p) } o.copyMu.RUnlock() diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index af52286a6..9da0c7067 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -85,19 +85,6 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, if err != nil { return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } - - // Replicate permissions and owner from lower to upper mount point. - attr, err := lower.UnstableAttr(ctx) - if err != nil { - return nil, fmt.Errorf("reading attributes from lower mount point: %v", err) - } - if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) { - return nil, fmt.Errorf("error setting permission to upper mount point") - } - if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil { - return nil, fmt.Errorf("setting owner to upper mount point: %v", err) - } - return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index b5f89033b..4ac511740 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -240,7 +240,7 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:munmap_test") syscall_test( - add_overlay = True, + add_overlay = False, # TODO(gvisor.dev/issue/316): enable when fixed. test = "//test/syscalls/linux:open_create_test", ) -- cgit v1.2.3 From cf51e77d6d7cf5302f03812bb726a86e9be572f5 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 28 Jun 2019 15:28:24 -0700 Subject: Fix suggestions from clang. PiperOrigin-RevId: 255679603 --- test/syscalls/linux/chmod.cc | 8 ++-- test/syscalls/linux/chown.cc | 8 ++-- test/syscalls/linux/chroot.cc | 3 +- test/syscalls/linux/exec.cc | 3 +- test/syscalls/linux/exec_proc_exe_workload.cc | 3 +- test/syscalls/linux/file_base.h | 4 +- test/syscalls/linux/inotify.cc | 17 ++++---- test/syscalls/linux/madvise.cc | 3 +- test/syscalls/linux/memfd.cc | 3 +- test/syscalls/linux/mknod.cc | 6 +-- test/syscalls/linux/mmap.cc | 19 ++++---- test/syscalls/linux/proc.cc | 51 +++++++++++++--------- test/syscalls/linux/proc_net_unix.cc | 7 +-- test/syscalls/linux/pty.cc | 2 +- test/syscalls/linux/readv_common.h | 2 +- test/syscalls/linux/sendfile.cc | 6 +-- test/syscalls/linux/socket_netlink_route.cc | 4 +- .../syscalls/linux/unix_domain_socket_test_util.cc | 9 ++-- test/syscalls/linux/unix_domain_socket_test_util.h | 2 +- test/util/fs_util.cc | 6 +-- test/util/fs_util.h | 20 ++++----- test/util/fs_util_test.cc | 9 ++-- test/util/logging.cc | 2 +- test/util/mount_util.h | 6 ++- test/util/posix_error.h | 2 +- test/util/proc_util.cc | 3 +- test/util/temp_path.cc | 7 ++- test/util/temp_path.h | 4 +- test/util/test_util.cc | 5 ++- 29 files changed, 128 insertions(+), 96 deletions(-) diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc index 79e98597f..7e918b9b2 100644 --- a/test/syscalls/linux/chmod.cc +++ b/test/syscalls/linux/chmod.cc @@ -140,7 +140,8 @@ TEST(ChmodTest, FchmodatFile) { SyscallSucceeds()); ASSERT_THAT( - fchmodat(parent_fd, std::string(Basename(temp_file.path())).c_str(), 0444, 0), + fchmodat(parent_fd, std::string(Basename(temp_file.path())).c_str(), 0444, + 0), SyscallSucceeds()); EXPECT_THAT(close(parent_fd), SyscallSucceeds()); @@ -165,8 +166,9 @@ TEST(ChmodTest, FchmodatDir) { SyscallSucceeds()); EXPECT_THAT(close(fd), SyscallSucceeds()); - ASSERT_THAT(fchmodat(parent_fd, std::string(Basename(dir.path())).c_str(), 0, 0), - SyscallSucceeds()); + ASSERT_THAT( + fchmodat(parent_fd, std::string(Basename(dir.path())).c_str(), 0, 0), + SyscallSucceeds()); EXPECT_THAT(close(parent_fd), SyscallSucceeds()); EXPECT_THAT(open(dir.path().c_str(), O_RDONLY | O_DIRECTORY), diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc index eb1762ddf..2e82f0b3a 100644 --- a/test/syscalls/linux/chown.cc +++ b/test/syscalls/linux/chown.cc @@ -186,10 +186,10 @@ INSTANTIATE_TEST_SUITE_P( return errorFromReturn("fchownat-fd", rc); }, [](const std::string& path, uid_t owner, gid_t group) -> PosixError { - ASSIGN_OR_RETURN_ERRNO( - auto dirfd, Open(std::string(Dirname(path)), O_DIRECTORY | O_RDONLY)); - int rc = fchownat(dirfd.get(), std::string(Basename(path)).c_str(), owner, - group, 0); + ASSIGN_OR_RETURN_ERRNO(auto dirfd, Open(std::string(Dirname(path)), + O_DIRECTORY | O_RDONLY)); + int rc = fchownat(dirfd.get(), std::string(Basename(path)).c_str(), + owner, group, 0); MaybeSave(); return errorFromReturn("fchownat-dirfd", rc); })); diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc index a4354ff62..498c45f16 100644 --- a/test/syscalls/linux/chroot.cc +++ b/test/syscalls/linux/chroot.cc @@ -273,7 +273,8 @@ TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) { ASSERT_GT(bytes_read, 0); // Finally we want to make sure the maps don't contain the chroot path - ASSERT_EQ(std::string(buf, bytes_read).find(temp_dir.path()), std::string::npos); + ASSERT_EQ(std::string(buf, bytes_read).find(temp_dir.path()), + std::string::npos); } // Test that mounts outside the chroot will not appear in /proc/self/mounts or diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 06c322a99..4c7c95321 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -255,7 +255,8 @@ TEST(ExecDeathTest, InterpreterScriptArgNUL) { TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), - absl::StrCat("#!", link.path(), " foo", std::string(1, '\0'), "bar"), 0755)); + absl::StrCat("#!", link.path(), " foo", std::string(1, '\0'), "bar"), + 0755)); CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0), absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n")); diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc index b3fbd5042..b790fe5be 100644 --- a/test/syscalls/linux/exec_proc_exe_workload.cc +++ b/test/syscalls/linux/exec_proc_exe_workload.cc @@ -21,7 +21,8 @@ #include "test/util/posix_error.h" int main(int argc, char** argv, char** envp) { - std::string exe = gvisor::testing::ProcessExePath(getpid()).ValueOrDie(); + std::string exe = + gvisor::testing::ProcessExePath(getpid()).ValueOrDie(); if (exe[0] != '/') { std::cerr << "relative path: " << exe << std::endl; exit(1); diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index b5b972c07..36efabcae 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -160,7 +160,7 @@ class SocketTest : public ::testing::Test { // MatchesStringLength checks that a tuple argument of (struct iovec *, int) // corresponding to an iovec array and its length, contains data that matches -// the std::string length strlen. +// the string length strlen. MATCHER_P(MatchesStringLength, strlen, "") { struct iovec* iovs = arg.first; int niov = arg.second; @@ -177,7 +177,7 @@ MATCHER_P(MatchesStringLength, strlen, "") { // MatchesStringValue checks that a tuple argument of (struct iovec *, int) // corresponding to an iovec array and its length, contains data that matches -// the std::string value str. +// the string value str. MATCHER_P(MatchesStringValue, str, "") { struct iovec* iovs = arg.first; int len = strlen(str); diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 6a3539e22..7384c27dc 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -122,8 +122,8 @@ std::string DumpEvents(const std::vector& events, int indent_level) { (events.size() > 1) ? "s" : ""); int i = 0; for (const Event& ev : events) { - ss << StreamFormat("%sevents[%d]: %s\n", std::string(indent_level, '\t'), i++, - DumpEvent(ev)); + ss << StreamFormat("%sevents[%d]: %s\n", std::string(indent_level, '\t'), + i++, DumpEvent(ev)); } return ss.str(); } @@ -295,10 +295,10 @@ PosixErrorOr> DrainEvents(int fd) { if (event.len > 0) { TEST_CHECK(static_cast(sizeof(struct inotify_event) + event.len) <= readlen); - ev.name = - std::string(cursor + offsetof(struct inotify_event, name)); // NOLINT + ev.name = std::string(cursor + + offsetof(struct inotify_event, name)); // NOLINT // Name field should always be smaller than event.len, otherwise we have - // a buffer overflow. The two sizes aren't equal because the std::string + // a buffer overflow. The two sizes aren't equal because the string // constructor will stop at the first null byte, while event.name may be // padded up to event.len using multiple null bytes. TEST_CHECK(ev.name.size() <= event.len); @@ -319,7 +319,8 @@ PosixErrorOr InotifyInit1(int flags) { return FileDescriptor(fd); } -PosixErrorOr InotifyAddWatch(int fd, const std::string& path, uint32_t mask) { +PosixErrorOr InotifyAddWatch(int fd, const std::string& path, + uint32_t mask) { int wd; EXPECT_THAT(wd = inotify_add_watch(fd, path.c_str(), mask), SyscallSucceeds()); @@ -980,8 +981,8 @@ TEST(Inotify, WatchOnRelativePath) { EXPECT_THAT(chdir(root.path().c_str()), SyscallSucceeds()); // Add a watch on file1 with a relative path. - const int wd = ASSERT_NO_ERRNO_AND_VALUE( - InotifyAddWatch(fd.get(), std::string(Basename(file1.path())), IN_ALL_EVENTS)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + fd.get(), std::string(Basename(file1.path())), IN_ALL_EVENTS)); // Perform a read on file1, this should generate an IN_ACCESS event. char c; diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc index 352fcc6c4..08ff4052c 100644 --- a/test/syscalls/linux/madvise.cc +++ b/test/syscalls/linux/madvise.cc @@ -162,7 +162,8 @@ TEST(MadviseDontforkTest, DontforkShared) { // Mmap two shared file-backed pages and MADV_DONTFORK the second page. TempPath f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( /* parent = */ GetAbsoluteTestTmpdir(), - /* content = */ std::string(kPageSize * 2, 2), TempPath::kDefaultFileMode)); + /* content = */ std::string(kPageSize * 2, 2), + TempPath::kDefaultFileMode)); FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap( diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc index 3494bcd13..e57b49a4a 100644 --- a/test/syscalls/linux/memfd.cc +++ b/test/syscalls/linux/memfd.cc @@ -60,7 +60,8 @@ int memfd_create(const std::string& name, unsigned int flags) { return syscall(__NR_memfd_create, name.c_str(), flags); } -PosixErrorOr MemfdCreate(const std::string& name, uint32_t flags) { +PosixErrorOr MemfdCreate(const std::string& name, + uint32_t flags) { int fd = memfd_create(name, flags); if (fd < 0) { return PosixError( diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc index febf2eb14..4c45766c7 100644 --- a/test/syscalls/linux/mknod.cc +++ b/test/syscalls/linux/mknod.cc @@ -90,7 +90,7 @@ TEST(MknodTest, Fifo) { ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds()); EXPECT_TRUE(S_ISFIFO(st.st_mode)); - std::string msg = "some string"; + std::string msg = "some std::string"; std::vector buf(512); // Read-end of the pipe. @@ -116,7 +116,7 @@ TEST(MknodTest, FifoOtrunc) { ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds()); EXPECT_TRUE(S_ISFIFO(st.st_mode)); - std::string msg = "some string"; + std::string msg = "some std::string"; std::vector buf(512); // Read-end of the pipe. ScopedThread t([&fifo, &buf, &msg]() { @@ -144,7 +144,7 @@ TEST(MknodTest, FifoTruncNoOp) { ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds()); EXPECT_TRUE(S_ISFIFO(st.st_mode)); - std::string msg = "some string"; + std::string msg = "some std::string"; std::vector buf(512); // Read-end of the pipe. ScopedThread t([&fifo, &buf, &msg]() { diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index 5b5b4c2e8..a112316e9 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -113,7 +113,7 @@ class MMapTest : public ::testing::Test { size_t length_ = 0; }; -// Matches if arg contains the same contents as std::string str. +// Matches if arg contains the same contents as string str. MATCHER_P(EqualsMemory, str, "") { if (0 == memcmp(arg, str.c_str(), str.size())) { return true; @@ -1086,8 +1086,8 @@ TEST_F(MMapFileTest, WriteShared) { ASSERT_THAT(Read(buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a - // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C - // std::string, possibly overruning the buffer. + // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C + // string, possibly overruning the buffer. EXPECT_THAT(reinterpret_cast(buf.data()), EqualsMemory(std::string(kFileContents))); } @@ -1122,6 +1122,7 @@ TEST_F(MMapFileTest, WriteSharedBeyondEnd) { ASSERT_THAT(Read(buf.data(), buf.size()), SyscallSucceedsWithValue(first.size())); // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a + // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C // std::string, possibly overruning the buffer. EXPECT_THAT(reinterpret_cast(buf.data()), EqualsMemory(first)); @@ -1159,8 +1160,8 @@ TEST_F(MMapFileTest, WriteSharedTruncateUp) { ASSERT_THAT(Read(buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a - // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C - // std::string, possibly overruning the buffer. + // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C + // string, possibly overruning the buffer. EXPECT_THAT(reinterpret_cast(buf.data()), EqualsMemory(first)); EXPECT_THAT(reinterpret_cast(buf.data() + kPageSize / 2), EqualsMemory(second)); @@ -1234,8 +1235,8 @@ TEST_F(MMapFileTest, WriteSharedTruncateDownThenUp) { ASSERT_THAT(Read(buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a - // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C - // std::string, possibly overruning the buffer. + // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C + // string, possibly overruning the buffer. EXPECT_THAT(reinterpret_cast(buf.data()), EqualsMemory(zeroed)); } @@ -1363,8 +1364,8 @@ TEST_F(MMapFileTest, WritePrivate) { ASSERT_THAT(Read(buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a - // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C - // std::string, possibly overruning the buffer. + // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C + // string, possibly overruning the buffer. EXPECT_THAT(reinterpret_cast(buf.data()), EqualsMemory(std::string(len, '\0'))); } diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 924b98e3a..490bf4424 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -206,8 +206,8 @@ PosixError WithSubprocess(SubprocessCallback const& running, } // Access the file returned by name when a subprocess is running. -PosixError AccessWhileRunning(std::function name, int flags, - std::function access) { +PosixError AccessWhileRunning(std::function name, + int flags, std::function access) { FileDescriptor fd; return WithSubprocess( [&](int pid) -> PosixError { @@ -221,8 +221,8 @@ PosixError AccessWhileRunning(std::function name, int flag } // Access the file returned by name when the a subprocess is zombied. -PosixError AccessWhileZombied(std::function name, int flags, - std::function access) { +PosixError AccessWhileZombied(std::function name, + int flags, std::function access) { FileDescriptor fd; return WithSubprocess( [&](int pid) -> PosixError { @@ -239,8 +239,8 @@ PosixError AccessWhileZombied(std::function name, int flag } // Access the file returned by name when the a subprocess is exited. -PosixError AccessWhileExited(std::function name, int flags, - std::function access) { +PosixError AccessWhileExited(std::function name, + int flags, std::function access) { FileDescriptor fd; return WithSubprocess( [&](int pid) -> PosixError { @@ -704,7 +704,8 @@ TEST(ProcSelfExe, Absolute) { // Sanity check for /proc/cpuinfo fields that must be present. TEST(ProcCpuinfo, RequiredFieldsArePresent) { - std::string proc_cpuinfo = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/cpuinfo")); + std::string proc_cpuinfo = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/cpuinfo")); ASSERT_FALSE(proc_cpuinfo.empty()); std::vector cpuinfo_fields = absl::StrSplit(proc_cpuinfo, '\n'); @@ -743,7 +744,8 @@ TEST(ProcCpuinfo, DeniesWrite) { // Sanity checks that uptime is present. TEST(ProcUptime, IsPresent) { - std::string proc_uptime = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/uptime")); + std::string proc_uptime = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/uptime")); ASSERT_FALSE(proc_uptime.empty()); std::vector uptime_parts = absl::StrSplit(proc_uptime, ' '); @@ -775,7 +777,8 @@ TEST(ProcUptime, IsPresent) { } TEST(ProcMeminfo, ContainsBasicFields) { - std::string proc_meminfo = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/meminfo")); + std::string proc_meminfo = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/meminfo")); EXPECT_THAT(proc_meminfo, AllOf(ContainsRegex(R"(MemTotal:\s+[0-9]+ kB)"), ContainsRegex(R"(MemFree:\s+[0-9]+ kB)"))); } @@ -853,12 +856,14 @@ TEST(ProcStat, Fields) { } TEST(ProcLoadavg, EndsWithNewline) { - std::string proc_loadvg = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg")); + std::string proc_loadvg = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg")); EXPECT_EQ(proc_loadvg.back(), '\n'); } TEST(ProcLoadavg, Fields) { - std::string proc_loadvg = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg")); + std::string proc_loadvg = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg")); std::vector lines = absl::StrSplit(proc_loadvg, '\n'); // Single line. @@ -1238,10 +1243,12 @@ TEST(ProcPidStatTest, VmStats) { EXPECT_NE('0', data_str[0]); } -// Parse an array of NUL-terminated char* arrays, returning a vector of strings. +// Parse an array of NUL-terminated char* arrays, returning a vector of +// strings. std::vector ParseNulTerminatedStrings(std::string contents) { EXPECT_EQ('\0', contents.back()); - // The split will leave an empty std::string if the NUL-byte remains, so pop it. + // The split will leave an empty string if the NUL-byte remains, so pop + // it. contents.pop_back(); return absl::StrSplit(contents, '\0'); @@ -1491,7 +1498,8 @@ TEST(ProcPidFile, SubprocessExited) { } PosixError DirContainsImpl(absl::string_view path, - const std::vector& targets, bool strict) { + const std::vector& targets, + bool strict) { ASSIGN_OR_RETURN_ERRNO(auto listing, ListDir(path, false)); bool success = true; @@ -1530,8 +1538,8 @@ PosixError DirContainsExactly(absl::string_view path, return DirContainsImpl(path, targets, true); } -PosixError EventuallyDirContainsExactly(absl::string_view path, - const std::vector& targets) { +PosixError EventuallyDirContainsExactly( + absl::string_view path, const std::vector& targets) { constexpr int kRetryCount = 100; const absl::Duration kRetryDelay = absl::Milliseconds(100); @@ -1553,11 +1561,13 @@ TEST(ProcTask, Basic) { DirContains("/proc/self/task", {".", "..", absl::StrCat(getpid())})); } -std::vector TaskFiles(const std::vector& initial_contents, - const std::vector& pids) { +std::vector TaskFiles( + const std::vector& initial_contents, + const std::vector& pids) { return VecCat( initial_contents, - ApplyVec([](const pid_t p) { return absl::StrCat(p); }, pids)); + ApplyVec([](const pid_t p) { return absl::StrCat(p); }, + pids)); } std::vector TaskFiles(const std::vector& pids) { @@ -1894,7 +1904,8 @@ void CheckDuplicatesRecursively(std::string path) { continue; } - ASSERT_EQ(children.find(std::string(dp->d_name)), children.end()) << dp->d_name; + ASSERT_EQ(children.find(std::string(dp->d_name)), children.end()) + << dp->d_name; children.insert(std::string(dp->d_name)); ASSERT_NE(dp->d_type, DT_UNKNOWN); diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc index 74acbe92c..9b9be66ff 100644 --- a/test/syscalls/linux/proc_net_unix.cc +++ b/test/syscalls/linux/proc_net_unix.cc @@ -67,7 +67,7 @@ std::string ExtractPath(const struct sockaddr* addr) { // Abstract socket paths are null padded to the end of the struct // sockaddr. However, these null bytes may or may not show up in // /proc/net/unix depending on the kernel version. Truncate after the first - // null byte (by treating path as a c-std::string). + // null byte (by treating path as a c-string). return StrCat("@", &path[1]); } return std::string(path); @@ -80,7 +80,7 @@ PosixErrorOr> ProcNetUnixEntries() { bool skipped_header = false; std::vector entries; - std::vector lines = absl::StrSplit(content, absl::ByAnyChar("\n")); + std::vector lines = absl::StrSplit(content, '\n'); std::cerr << "" << std::endl; for (std::string line : lines) { // Emit the proc entry to the test output to provide context for the test @@ -123,7 +123,8 @@ PosixErrorOr> ProcNetUnixEntries() { UnixEntry entry; // Process the first 6 fields, up to but not including "Inode". - std::vector fields = absl::StrSplit(line, absl::MaxSplits(' ', 6)); + std::vector fields = + absl::StrSplit(line, absl::MaxSplits(' ', 6)); if (fields.size() < 7) { return PosixError(EINVAL, StrFormat("Invalid entry: '%s'\n", line)); diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc index f926ac0f9..d1ab4703f 100644 --- a/test/syscalls/linux/pty.cc +++ b/test/syscalls/linux/pty.cc @@ -105,7 +105,7 @@ struct Field { uint64_t value; }; -// ParseFields returns a std::string representation of value, using the names in +// ParseFields returns a string representation of value, using the names in // fields. std::string ParseFields(const Field* fields, size_t len, uint64_t value) { bool first = true; diff --git a/test/syscalls/linux/readv_common.h b/test/syscalls/linux/readv_common.h index b16179fca..2fa40c35f 100644 --- a/test/syscalls/linux/readv_common.h +++ b/test/syscalls/linux/readv_common.h @@ -20,7 +20,7 @@ namespace gvisor { namespace testing { -// A NUL-terminated std::string containing the data used by tests using the following +// A NUL-terminated string containing the data used by tests using the following // test helpers. extern const char kReadvTestData[]; diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc index 2fbb3f4ef..e5d72e28a 100644 --- a/test/syscalls/linux/sendfile.cc +++ b/test/syscalls/linux/sendfile.cc @@ -135,7 +135,7 @@ TEST(SendFileTest, SendTriviallyWithBothFilesReadWrite) { TEST(SendFileTest, SendAndUpdateFileOffset) { // Create temp files. - // Test input std::string length must be > 2 AND even. + // Test input string length must be > 2 AND even. constexpr char kData[] = "The slings and arrows of outrageous fortune,"; constexpr int kDataSize = sizeof(kData) - 1; constexpr int kHalfDataSize = kDataSize / 2; @@ -180,7 +180,7 @@ TEST(SendFileTest, SendAndUpdateFileOffset) { TEST(SendFileTest, SendAndUpdateFileOffsetFromNonzeroStartingPoint) { // Create temp files. - // Test input std::string length must be > 2 AND divisible by 4. + // Test input string length must be > 2 AND divisible by 4. constexpr char kData[] = "The slings and arrows of outrageous fortune,"; constexpr int kDataSize = sizeof(kData) - 1; constexpr int kHalfDataSize = kDataSize / 2; @@ -233,7 +233,7 @@ TEST(SendFileTest, SendAndUpdateFileOffsetFromNonzeroStartingPoint) { TEST(SendFileTest, SendAndUpdateGivenOffset) { // Create temp files. - // Test input std::string length must be >= 4 AND divisible by 4. + // Test input string length must be >= 4 AND divisible by 4. constexpr char kData[] = "Or to take Arms against a Sea of troubles,"; constexpr int kDataSize = sizeof(kData) + 1; constexpr int kHalfDataSize = kDataSize / 2; diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index 53dd1ca78..b5c38f27e 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -151,8 +151,8 @@ TEST(NetlinkRouteTest, GetPeerName) { // the value is considered ok. // 2: A description of what the sockopt value is expected to be. Should complete // the sentence " was unexpected, expected " -using SockOptTest = - ::testing::TestWithParam, std::string>>; +using SockOptTest = ::testing::TestWithParam< + std::tuple, std::string>>; TEST_P(SockOptTest, GetSockOpt) { int sockopt = std::get<0>(GetParam()); diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc index ff28850b2..7fb9eed8d 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.cc +++ b/test/syscalls/linux/unix_domain_socket_test_util.cc @@ -53,7 +53,7 @@ SocketPairKind UnixDomainSocketPair(int type) { SocketPairKind FilesystemBoundUnixDomainSocketPair(int type) { std::string description = absl::StrCat(DescribeUnixDomainSocketType(type), - " created with filesystem binding"); + " created with filesystem binding"); if ((type & SOCK_DGRAM) == SOCK_DGRAM) { return SocketPairKind{ description, AF_UNIX, type, 0, @@ -65,8 +65,9 @@ SocketPairKind FilesystemBoundUnixDomainSocketPair(int type) { } SocketPairKind AbstractBoundUnixDomainSocketPair(int type) { - std::string description = absl::StrCat(DescribeUnixDomainSocketType(type), - " created with abstract namespace binding"); + std::string description = + absl::StrCat(DescribeUnixDomainSocketType(type), + " created with abstract namespace binding"); if ((type & SOCK_DGRAM) == SOCK_DGRAM) { return SocketPairKind{ description, AF_UNIX, type, 0, @@ -78,7 +79,7 @@ SocketPairKind AbstractBoundUnixDomainSocketPair(int type) { SocketPairKind SocketpairGoferUnixDomainSocketPair(int type) { std::string description = absl::StrCat(DescribeUnixDomainSocketType(type), - " created with the socketpair gofer"); + " created with the socketpair gofer"); return SocketPairKind{description, AF_UNIX, type, 0, SocketpairGoferSocketPairCreator(AF_UNIX, type, 0)}; } diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h index 25729e421..5eca0b7f0 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.h +++ b/test/syscalls/linux/unix_domain_socket_test_util.h @@ -21,7 +21,7 @@ namespace gvisor { namespace testing { -// DescribeUnixDomainSocketType returns a human-readable std::string explaining the +// DescribeUnixDomainSocketType returns a human-readable string explaining the // given Unix domain socket type. std::string DescribeUnixDomainSocketType(int type); diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc index bc90bd78e..ae49725a0 100644 --- a/test/util/fs_util.cc +++ b/test/util/fs_util.cc @@ -290,7 +290,7 @@ PosixError WalkTree( } PosixErrorOr> ListDir(absl::string_view abspath, - bool skipdots) { + bool skipdots) { std::vector files; DIR* dir = opendir(std::string(abspath).c_str()); @@ -381,7 +381,7 @@ PosixError RecursivelyCreateDir(absl::string_view path) { // Makes a path absolute with respect to an optional base. If no base is // provided it will use the current working directory. PosixErrorOr MakeAbsolute(absl::string_view filename, - absl::string_view base) { + absl::string_view base) { if (filename.empty()) { return PosixError(EINVAL, "filename cannot be empty."); } @@ -494,7 +494,7 @@ std::string CleanPath(const absl::string_view unclean_path) { } PosixErrorOr GetRelativePath(absl::string_view source, - absl::string_view dest) { + absl::string_view dest) { if (!absl::StartsWith(source, "/") || !absl::StartsWith(dest, "/")) { // At least one of the inputs is not an absolute path. return PosixError( diff --git a/test/util/fs_util.h b/test/util/fs_util.h index eb7cdaa24..3969f8309 100644 --- a/test/util/fs_util.h +++ b/test/util/fs_util.h @@ -61,7 +61,7 @@ PosixError SetContents(absl::string_view path, absl::string_view contents); PosixError CreateWithContents(absl::string_view path, absl::string_view contents, int mode = 0666); -// Attempts to read the entire contents of the file into the provided std::string +// Attempts to read the entire contents of the file into the provided string // buffer or returns an error. PosixError GetContents(absl::string_view path, std::string* output); @@ -69,7 +69,7 @@ PosixError GetContents(absl::string_view path, std::string* output); PosixErrorOr GetContents(absl::string_view path); // Attempts to read the entire contents of the provided fd into the provided -// std::string or returns an error. +// string or returns an error. PosixError GetContentsFD(int fd, std::string* output); // Attempts to read the entire contents of the provided fd or returns an error. @@ -94,7 +94,7 @@ PosixError WalkTree( // method does not walk the tree recursively it only returns the elements // in that directory. PosixErrorOr> ListDir(absl::string_view abspath, - bool skipdots); + bool skipdots); // Attempt to recursively delete a directory or file. Returns an error and // the number of undeleted directories and files. If either @@ -108,20 +108,20 @@ PosixError RecursivelyCreateDir(absl::string_view path); // Makes a path absolute with respect to an optional base. If no base is // provided it will use the current working directory. PosixErrorOr MakeAbsolute(absl::string_view filename, - absl::string_view base); + absl::string_view base); // Generates a relative path from the source directory to the destination // (dest) file or directory. This uses ../ when necessary for destinations // which are not nested within the source. Both source and dest are required -// to be absolute paths, and an empty std::string will be returned if they are not. +// to be absolute paths, and an empty string will be returned if they are not. PosixErrorOr GetRelativePath(absl::string_view source, - absl::string_view dest); + absl::string_view dest); // Returns the part of the path before the final "/", EXCEPT: // * If there is a single leading "/" in the path, the result will be the // leading "/". // * If there is no "/" in the path, the result is the empty prefix of the -// input std::string. +// input string. absl::string_view Dirname(absl::string_view path); // Return the parts of the path, split on the final "/". If there is no @@ -135,7 +135,7 @@ std::pair SplitPath( // "/" in the path, the result is the same as the input. // Note that this function's behavior differs from the Unix basename // command if path ends with "/". For such paths, this function returns the -// empty std::string. +// empty string. absl::string_view Basename(absl::string_view path); // Collapse duplicate "/"s, resolve ".." and "." path elements, remove @@ -144,7 +144,7 @@ absl::string_view Basename(absl::string_view path); // NOTE: This respects relative vs. absolute paths, but does not // invoke any system calls (getcwd(2)) in order to resolve relative // paths wrt actual working directory. That is, this is purely a -// std::string manipulation, completely independent of process state. +// string manipulation, completely independent of process state. std::string CleanPath(absl::string_view path); // Returns the full path to the executable of the given pid or a PosixError. @@ -174,7 +174,7 @@ inline std::string JoinPath(absl::string_view path) { std::string JoinPath(absl::string_view path1, absl::string_view path2); template inline std::string JoinPath(absl::string_view path1, absl::string_view path2, - absl::string_view path3, const T&... args) { + absl::string_view path3, const T&... args) { return internal::JoinPathImpl({path1, path2, path3, args...}); } } // namespace testing diff --git a/test/util/fs_util_test.cc b/test/util/fs_util_test.cc index 4e12076a1..2a200320a 100644 --- a/test/util/fs_util_test.cc +++ b/test/util/fs_util_test.cc @@ -29,7 +29,8 @@ namespace { TEST(FsUtilTest, RecursivelyCreateDirManualDelete) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - const std::string base_path = JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); + const std::string base_path = + JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); ASSERT_THAT(Exists(base_path), IsPosixErrorOkAndHolds(false)); ASSERT_NO_ERRNO(RecursivelyCreateDir(base_path)); @@ -48,7 +49,8 @@ TEST(FsUtilTest, RecursivelyCreateDirManualDelete) { TEST(FsUtilTest, RecursivelyCreateAndDeleteDir) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - const std::string base_path = JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); + const std::string base_path = + JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); ASSERT_THAT(Exists(base_path), IsPosixErrorOkAndHolds(false)); ASSERT_NO_ERRNO(RecursivelyCreateDir(base_path)); @@ -60,7 +62,8 @@ TEST(FsUtilTest, RecursivelyCreateAndDeleteDir) { TEST(FsUtilTest, RecursivelyCreateAndDeletePartial) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - const std::string base_path = JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); + const std::string base_path = + JoinPath(root.path(), "/a/b/c/d/e/f/g/h/i/j/k/l/m"); ASSERT_THAT(Exists(base_path), IsPosixErrorOkAndHolds(false)); ASSERT_NO_ERRNO(RecursivelyCreateDir(base_path)); diff --git a/test/util/logging.cc b/test/util/logging.cc index cc71d77b0..5d5e76c46 100644 --- a/test/util/logging.cc +++ b/test/util/logging.cc @@ -50,7 +50,7 @@ int WriteNumber(int fd, uint32_t val) { constexpr int kBufferSize = 11; char buf[kBufferSize]; - // Convert the number to std::string. + // Convert the number to string. char* s = buf + sizeof(buf) - 1; size_t size = 0; diff --git a/test/util/mount_util.h b/test/util/mount_util.h index 7782e6bf2..38ec6c8a1 100644 --- a/test/util/mount_util.h +++ b/test/util/mount_util.h @@ -30,9 +30,11 @@ namespace testing { // Mount mounts the filesystem, and unmounts when the returned reference is // destroyed. -inline PosixErrorOr Mount(const std::string &source, const std::string &target, +inline PosixErrorOr Mount(const std::string &source, + const std::string &target, const std::string &fstype, uint64_t mountflags, - const std::string &data, uint64_t umountflags) { + const std::string &data, + uint64_t umountflags) { if (mount(source.c_str(), target.c_str(), fstype.c_str(), mountflags, data.c_str()) == -1) { return PosixError(errno, "mount failed"); diff --git a/test/util/posix_error.h b/test/util/posix_error.h index b604f4f8f..ad666bce0 100644 --- a/test/util/posix_error.h +++ b/test/util/posix_error.h @@ -51,7 +51,7 @@ class ABSL_MUST_USE_RESULT PosixError { std::string error_message() const { return msg_; } - // ToString produces a full std::string representation of this posix error + // ToString produces a full string representation of this posix error // including the printable representation of the errno and the error message. std::string ToString() const; diff --git a/test/util/proc_util.cc b/test/util/proc_util.cc index 9d4db37c3..75b24da37 100644 --- a/test/util/proc_util.cc +++ b/test/util/proc_util.cc @@ -76,7 +76,8 @@ PosixErrorOr ParseProcMapsLine(absl::string_view line) { if (parts.size() == 6) { // A filename is present. However, absl::StrSplit retained the whitespace // between the inode number and the filename. - map_entry.filename = std::string(absl::StripLeadingAsciiWhitespace(parts[5])); + map_entry.filename = + std::string(absl::StripLeadingAsciiWhitespace(parts[5])); } return map_entry; diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index de7c04a6f..35aacb172 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -70,13 +70,16 @@ std::string NewTempAbsPathInDir(absl::string_view const dir) { return JoinPath(dir, NextTempBasename()); } -std::string NewTempAbsPath() { return NewTempAbsPathInDir(GetAbsoluteTestTmpdir()); } +std::string NewTempAbsPath() { + return NewTempAbsPathInDir(GetAbsoluteTestTmpdir()); +} std::string NewTempRelPath() { return NextTempBasename(); } std::string GetAbsoluteTestTmpdir() { char* env_tmpdir = getenv("TEST_TMPDIR"); - std::string tmp_dir = env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp"; + std::string tmp_dir = + env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp"; return MakeAbsolute(tmp_dir, "").ValueOrDie(); } diff --git a/test/util/temp_path.h b/test/util/temp_path.h index 89302e0fd..92d669503 100644 --- a/test/util/temp_path.h +++ b/test/util/temp_path.h @@ -30,7 +30,7 @@ namespace testing { // Distinct calls to NewTempAbsPathInDir from the same process, even from // multiple threads, are guaranteed to return different paths. Distinct calls to // NewTempAbsPathInDir from different processes are not synchronized. -std::string NewTempAbsPathInDir(absl::string_view base); +std::string NewTempAbsPathInDir(absl::string_view const dir); // Like NewTempAbsPathInDir, but the returned path is in the test's temporary // directory, as provided by the testing framework. @@ -105,7 +105,7 @@ class TempPath { // Changes the path this TempPath represents. If the TempPath already // represented a path, deletes and returns that path. Otherwise returns the - // empty std::string. + // empty string. std::string reset(std::string newpath); std::string reset() { return reset(""); } diff --git a/test/util/test_util.cc b/test/util/test_util.cc index bf0029951..e42bba04a 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -73,7 +73,7 @@ Platform GvisorPlatform() { CPUVendor GetCPUVendor() { uint32_t eax, ebx, ecx, edx; std::string vendor_str; - // Get vendor std::string (issue CPUID with eax = 0) + // Get vendor string (issue CPUID with eax = 0) GETCPUID(eax, ebx, ecx, edx, 0, 0); vendor_str.append(reinterpret_cast(&ebx), 4); vendor_str.append(reinterpret_cast(&edx), 4); @@ -93,7 +93,8 @@ bool operator==(const KernelVersion& first, const KernelVersion& second) { PosixErrorOr ParseKernelVersion(absl::string_view vers_str) { KernelVersion version = {}; - std::vector values = absl::StrSplit(vers_str, absl::ByAnyChar(".-")); + std::vector values = + absl::StrSplit(vers_str, absl::ByAnyChar(".-")); if (values.size() == 2) { ASSIGN_OR_RETURN_ERRNO(version.major, Atoi(values[0])); ASSIGN_OR_RETURN_ERRNO(version.minor, Atoi(values[1])); -- cgit v1.2.3 From c4da599e22e1490dc2a108cbf85ba19af2f5e2df Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Fri, 28 Jun 2019 16:17:19 -0700 Subject: ext4: disklayout: SuperBlock interface implementations. PiperOrigin-RevId: 255687771 --- pkg/sentry/fs/ext4/disklayout/BUILD | 11 ++- pkg/sentry/fs/ext4/disklayout/block_group_test.go | 16 +--- pkg/sentry/fs/ext4/disklayout/superblock_32.go | 76 +++++++++++++++ pkg/sentry/fs/ext4/disklayout/superblock_64.go | 94 +++++++++++++++++++ pkg/sentry/fs/ext4/disklayout/superblock_old.go | 108 ++++++++++++++++++++++ pkg/sentry/fs/ext4/disklayout/superblock_test.go | 27 ++++++ pkg/sentry/fs/ext4/disklayout/test_utils.go | 30 ++++++ 7 files changed, 348 insertions(+), 14 deletions(-) create mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_32.go create mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_64.go create mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_old.go create mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_test.go create mode 100644 pkg/sentry/fs/ext4/disklayout/test_utils.go diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD index be637bcbb..cdac63655 100644 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ b/pkg/sentry/fs/ext4/disklayout/BUILD @@ -9,14 +9,21 @@ go_library( "block_group_32.go", "block_group_64.go", "superblock.go", + "superblock_32.go", + "superblock_64.go", + "superblock_old.go", + "test_utils.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout", + deps = ["//pkg/binary"], ) go_test( name = "disklayout_test", size = "small", - srcs = ["block_group_test.go"], + srcs = [ + "block_group_test.go", + "superblock_test.go", + ], embed = [":disklayout"], - deps = ["//pkg/binary"], ) diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_test.go b/pkg/sentry/fs/ext4/disklayout/block_group_test.go index 794ae99cc..0ef4294c0 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_test.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group_test.go @@ -16,19 +16,11 @@ package disklayout import ( "testing" - - "gvisor.dev/gvisor/pkg/binary" ) -// TestBlockGroupSize tests the fact that the block group struct for -// 32-bit ext filesystems should be exactly 32 bytes big and for 64-bit fs it -// should be 64 bytes. +// TestBlockGroupSize tests that the block group descriptor structs are of the +// correct size. func TestBlockGroupSize(t *testing.T) { - if got, want := int(binary.Size(BlockGroup32Bit{})), 32; got != want { - t.Errorf("BlockGroup32Bit should be exactly 32 bytes but is %d bytes", got) - } - - if got, want := int(binary.Size(BlockGroup64Bit{})), 64; got != want { - t.Errorf("BlockGroup64Bit should be exactly 64 bytes but is %d bytes", got) - } + assertSize(t, BlockGroup32Bit{}, 32) + assertSize(t, BlockGroup64Bit{}, 64) } diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_32.go b/pkg/sentry/fs/ext4/disklayout/superblock_32.go new file mode 100644 index 000000000..4c3233eed --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/superblock_32.go @@ -0,0 +1,76 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. +// +// The suffix `Raw` has been added to indicate that the field does not have a +// counterpart in the 64-bit version and to resolve name collision with the +// interface. +type SuperBlock32Bit struct { + // We embed the old superblock struct here because the 32-bit version is just + // an extension of the old version. + SuperBlockOld + + FirstInode uint32 + InodeSizeRaw uint16 + BlockGroupNumber uint16 + FeatureCompat uint32 + FeatureIncompat uint32 + FeatureRoCompat uint32 + UUID [16]byte + VolumeName [16]byte + LastMounted [64]byte + AlgoUsageBitmap uint32 + PreallocBlocks uint8 + PreallocDirBlocks uint8 + ReservedGdtBlocks uint16 + JournalUUID [16]byte + JournalInum uint32 + JournalDev uint32 + LastOrphan uint32 + HashSeed [4]uint32 + DefaultHashVersion uint8 + JnlBackupType uint8 + BgDescSizeRaw uint16 + DefaultMountOpts uint32 + FirstMetaBg uint32 + MkfsTime uint32 + JnlBlocks [17]uint32 +} + +// Only override methods which change based on the additional fields above. +// Not overriding SuperBlock.BgDescSize because it would still return 32 here. + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlock32Bit) InodeSize() uint16 { + return sb.InodeSizeRaw +} + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures { + return CompatFeaturesFromInt(sb.FeatureCompat) +} + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. +func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures { + return IncompatFeaturesFromInt(sb.FeatureIncompat) +} + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures { + return RoCompatFeaturesFromInt(sb.FeatureRoCompat) +} diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_64.go b/pkg/sentry/fs/ext4/disklayout/superblock_64.go new file mode 100644 index 000000000..2e945a7c7 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/superblock_64.go @@ -0,0 +1,94 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly +// 1024 bytes (smallest possible block size) and hence the superblock always +// fits in no more than one data block. +// +// The suffix `Hi` here stands for upper bits because they represent the upper +// half of the fields. +type SuperBlock64Bit struct { + // We embed the 32-bit struct here because 64-bit version is just an extension + // of the 32-bit version. + SuperBlock32Bit + + BlocksCountHi uint32 + ReservedBlocksCountHi uint32 + FreeBlocksCountHi uint32 + MinInodeSize uint16 + WantInodeSize uint16 + Flags uint32 + RaidStride uint16 + MmpInterval uint16 + MmpBlock uint64 + RaidStripeWidth uint32 + LogGroupsPerFlex uint8 + ChecksumType uint8 + _ uint16 + KbytesWritten uint64 + SnapshotInum uint32 + SnapshotID uint32 + SnapshotRsrvBlocksCount uint64 + SnapshotList uint32 + ErrorCount uint32 + FirstErrorTime uint32 + FirstErrorInode uint32 + FirstErrorBlock uint64 + FirstErrorFunction [32]byte + FirstErrorLine uint32 + LastErrorTime uint32 + LastErrorInode uint32 + LastErrorLine uint32 + LastErrorBlock uint64 + LastErrorFunction [32]byte + MountOpts [64]byte + UserQuotaInum uint32 + GroupQuotaInum uint32 + OverheadBlocks uint32 + BackupBgs [2]uint32 + EncryptAlgos [4]uint8 + EncryptPwSalt [16]uint8 + LostFoundInode uint32 + ProjectQuotaInode uint32 + ChecksumSeed uint32 + WtimeHi uint8 + MtimeHi uint8 + MkfsTimeHi uint8 + LastCheckHi uint8 + FirstErrorTimeHi uint8 + LastErrorTimeHi uint8 + _ [2]uint8 + Encoding uint16 + EncodingFlags uint16 + _ [95]uint32 + Checksum uint32 +} + +// Only override methods which change based on the 64-bit feature. + +// BlocksCount implements SuperBlock.BlocksCount. +func (sb *SuperBlock64Bit) BlocksCount() uint64 { + return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo) +} + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 { + return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo) +} + +// BgDescSize implements SuperBlock.BgDescSize. +func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw } diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_old.go b/pkg/sentry/fs/ext4/disklayout/superblock_old.go new file mode 100644 index 000000000..1f7425ba3 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/superblock_old.go @@ -0,0 +1,108 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlockOld implements SuperBlock and represents the old version of the +// superblock struct in ext2 and ext3 systems. +// +// The suffix `Lo` here stands for lower bits because this is also used in the +// 64-bit version where these fields represent the lower half of the fields. +// The suffix `Raw` has been added to indicate that the field does not have a +// counterpart in the 64-bit version and to resolve name collision with the +// interface. +type SuperBlockOld struct { + InodesCountRaw uint32 + BlocksCountLo uint32 + ReservedBlocksCount uint32 + FreeBlocksCountLo uint32 + FreeInodesCountRaw uint32 + FirstDataBlockRaw uint32 + LogBlockSize uint32 + LogClusterSize uint32 + BlocksPerGroupRaw uint32 + ClustersPerGroupRaw uint32 + InodesPerGroupRaw uint32 + Mtime uint32 + Wtime uint32 + MountCountRaw uint16 + MaxMountCountRaw uint16 + MagicRaw uint16 + State uint16 + Errors uint16 + MinorRevLevel uint16 + LastCheck uint32 + CheckInterval uint32 + CreatorOS uint32 + RevLevel uint32 + DefResUID uint16 + DefResGID uint16 +} + +// InodesCount implements SuperBlock.InodesCount. +func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } + +// BlocksCount implements SuperBlock.BlocksCount. +func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) } + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) } + +// FreeInodesCount implements SuperBlock.FreeInodesCount. +func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw } + +// MountCount implements SuperBlock.MountCount. +func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw } + +// MaxMountCount implements SuperBlock.MaxMountCount. +func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw } + +// FirstDataBlock implements SuperBlock.FirstDataBlock. +func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw } + +// BlockSize implements SuperBlock.BlockSize. +func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) } + +// BlocksPerGroup implements SuperBlock.BlocksPerGroup. +func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw } + +// ClusterSize implements SuperBlock.ClusterSize. +func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) } + +// ClustersPerGroup implements SuperBlock.ClustersPerGroup. +func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlockOld) InodeSize() uint16 { return 128 } + +// InodesPerGroup implements SuperBlock.InodesPerGroup. +func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } + +// BgDescSize implements SuperBlock.BgDescSize. +func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 } + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} } + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. +func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} } + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} } + +// Magic implements SuperBlock.Magic. +func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw } + +// Revision implements SuperBlock.Revision. +func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) } diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_test.go b/pkg/sentry/fs/ext4/disklayout/superblock_test.go new file mode 100644 index 000000000..463b5ba21 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/superblock_test.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestSuperBlockSize tests that the superblock structs are of the correct +// size. +func TestSuperBlockSize(t *testing.T) { + assertSize(t, SuperBlockOld{}, 84) + assertSize(t, SuperBlock32Bit{}, 336) + assertSize(t, SuperBlock64Bit{}, 1024) +} diff --git a/pkg/sentry/fs/ext4/disklayout/test_utils.go b/pkg/sentry/fs/ext4/disklayout/test_utils.go new file mode 100644 index 000000000..9c63f04c0 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/test_utils.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/binary" +) + +func assertSize(t *testing.T, v interface{}, want uintptr) { + t.Helper() + + if got := binary.Size(v); got != want { + t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) + } +} -- cgit v1.2.3 From d3f97aec49e5364e33fa7acd9f81206ed3fa4aed Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 28 Jun 2019 16:49:16 -0700 Subject: Remove events from name_to_handle_at and open_by_handle_at. These syscalls require filesystem support that gVisor does not provide, and is not planning to implement. Their absense should not trigger an event. PiperOrigin-RevId: 255692871 --- pkg/sentry/syscalls/linux/linux64.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 7f18b1ac8..94816b038 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -351,8 +351,8 @@ var AMD64 = &kernel.SyscallTable{ 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 302: syscalls.Undocumented("prlimit64", Prlimit64), - 303: syscalls.ErrorWithEvent("name_to_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), - 304: syscalls.ErrorWithEvent("open_by_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), + 303: syscalls.Error("name_to_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 306: syscalls.Undocumented("syncfs", Syncfs), 307: syscalls.Undocumented("sendmmsg", SendMMsg), -- cgit v1.2.3 From 7dae043fecc109d7a0ae1d5d6274e48b1bf04d13 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 28 Jun 2019 17:19:04 -0700 Subject: Drop ashmem and binder. These are unfortunately unused and unmaintained. They can be brought back in the future if need requires it. PiperOrigin-RevId: 255697132 --- pkg/abi/linux/BUILD | 2 - pkg/abi/linux/ashmem.go | 29 ---- pkg/abi/linux/binder.go | 20 --- pkg/abi/linux/ioctl.go | 25 --- pkg/sentry/fs/ashmem/BUILD | 65 ------- pkg/sentry/fs/ashmem/area.go | 308 --------------------------------- pkg/sentry/fs/ashmem/device.go | 61 ------- pkg/sentry/fs/ashmem/pin_board.go | 127 -------------- pkg/sentry/fs/ashmem/pin_board_test.go | 130 -------------- pkg/sentry/fs/binder/BUILD | 27 --- pkg/sentry/fs/binder/binder.go | 260 ---------------------------- pkg/sentry/fs/dev/BUILD | 2 - pkg/sentry/fs/dev/dev.go | 14 +- pkg/sentry/fs/dev/fs.go | 37 +--- 14 files changed, 2 insertions(+), 1105 deletions(-) delete mode 100644 pkg/abi/linux/ashmem.go delete mode 100644 pkg/abi/linux/binder.go delete mode 100644 pkg/sentry/fs/ashmem/BUILD delete mode 100644 pkg/sentry/fs/ashmem/area.go delete mode 100644 pkg/sentry/fs/ashmem/device.go delete mode 100644 pkg/sentry/fs/ashmem/pin_board.go delete mode 100644 pkg/sentry/fs/ashmem/pin_board_test.go delete mode 100644 pkg/sentry/fs/binder/BUILD delete mode 100644 pkg/sentry/fs/binder/binder.go diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 29ea12ad5..cd405aa96 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -10,9 +10,7 @@ go_library( name = "linux", srcs = [ "aio.go", - "ashmem.go", "audit.go", - "binder.go", "bpf.go", "capability.go", "dev.go", diff --git a/pkg/abi/linux/ashmem.go b/pkg/abi/linux/ashmem.go deleted file mode 100644 index 2a722abe0..000000000 --- a/pkg/abi/linux/ashmem.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package linux - -// Constants used by ashmem in pin-related ioctls. -const ( - AshmemNotPurged = 0 - AshmemWasPurged = 1 - AshmemIsUnpinned = 0 - AshmemIsPinned = 1 -) - -// AshmemPin structure is used for pin-related ioctls. -type AshmemPin struct { - Offset uint32 - Len uint32 -} diff --git a/pkg/abi/linux/binder.go b/pkg/abi/linux/binder.go deleted file mode 100644 index 63b08324a..000000000 --- a/pkg/abi/linux/binder.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package linux - -// BinderVersion structure is used for BINDER_VERSION ioctl. -type BinderVersion struct { - ProtocolVersion int32 -} diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 04bb767dc..0e18db9ef 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -72,28 +72,3 @@ const ( SIOCGMIIPHY = 0x8947 SIOCGMIIREG = 0x8948 ) - -// ioctl(2) requests provided by uapi/linux/android/binder.h -const ( - BinderWriteReadIoctl = 0xc0306201 - BinderSetIdleTimeoutIoctl = 0x40086203 - BinderSetMaxThreadsIoctl = 0x40046205 - BinderSetIdlePriorityIoctl = 0x40046206 - BinderSetContextMgrIoctl = 0x40046207 - BinderThreadExitIoctl = 0x40046208 - BinderVersionIoctl = 0xc0046209 -) - -// ioctl(2) requests provided by drivers/staging/android/uapi/ashmem.h -const ( - AshmemSetNameIoctl = 0x41007701 - AshmemGetNameIoctl = 0x81007702 - AshmemSetSizeIoctl = 0x40087703 - AshmemGetSizeIoctl = 0x00007704 - AshmemSetProtMaskIoctl = 0x40087705 - AshmemGetProtMaskIoctl = 0x00007706 - AshmemPinIoctl = 0x40087707 - AshmemUnpinIoctl = 0x40087708 - AshmemGetPinStatusIoctl = 0x00007709 - AshmemPurgeAllCachesIoctl = 0x0000770a -) diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD deleted file mode 100644 index 5b755cec5..000000000 --- a/pkg/sentry/fs/ashmem/BUILD +++ /dev/null @@ -1,65 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -go_library( - name = "ashmem", - srcs = [ - "area.go", - "device.go", - "pin_board.go", - "uint64_range.go", - "uint64_set.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ashmem", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/fs", - "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/tmpfs", - "//pkg/sentry/kernel/time", - "//pkg/sentry/memmap", - "//pkg/sentry/usage", - "//pkg/sentry/usermem", - "//pkg/syserror", - "//pkg/waiter", - ], -) - -go_test( - name = "ashmem_test", - size = "small", - srcs = ["pin_board_test.go"], - embed = [":ashmem"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/usermem", - ], -) - -go_template_instance( - name = "uint64_range", - out = "uint64_range.go", - package = "ashmem", - template = "//pkg/segment:generic_range", - types = { - "T": "uint64", - }, -) - -go_template_instance( - name = "uint64_set", - out = "uint64_set.go", - package = "ashmem", - template = "//pkg/segment:generic_set", - types = { - "Key": "uint64", - "Range": "Range", - "Value": "noValue", - "Functions": "setFunctions", - }, -) diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go deleted file mode 100644 index 3b8d6ca89..000000000 --- a/pkg/sentry/fs/ashmem/area.go +++ /dev/null @@ -1,308 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ashmem - -import ( - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/waiter" -) - -const ( - // namePrefix is the name prefix assumed and forced by the Linux implementation. - namePrefix = "dev/ashmem" - - // nameLen is the maximum name length. - nameLen = 256 -) - -// Area implements fs.FileOperations. -// -// +stateify savable -type Area struct { - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` - - ad *Device - - // mu protects fields below. - mu sync.Mutex `state:"nosave"` - tmpfsFile *fs.File - name string - size uint64 - perms usermem.AccessType - pb *PinBoard -} - -// Release implements fs.FileOperations.Release. -func (a *Area) Release() { - a.mu.Lock() - defer a.mu.Unlock() - if a.tmpfsFile != nil { - a.tmpfsFile.DecRef() - a.tmpfsFile = nil - } -} - -// Seek implements fs.FileOperations.Seek. -func (a *Area) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { - a.mu.Lock() - defer a.mu.Unlock() - if a.size == 0 { - return 0, syserror.EINVAL - } - if a.tmpfsFile == nil { - return 0, syserror.EBADF - } - return a.tmpfsFile.FileOperations.Seek(ctx, file, whence, offset) -} - -// Read implements fs.FileOperations.Read. -func (a *Area) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - a.mu.Lock() - defer a.mu.Unlock() - if a.size == 0 { - return 0, nil - } - if a.tmpfsFile == nil { - return 0, syserror.EBADF - } - return a.tmpfsFile.FileOperations.Read(ctx, file, dst, offset) -} - -// Write implements fs.FileOperations.Write. -func (a *Area) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - return 0, syserror.ENOSYS -} - -// ConfigureMMap implements fs.FileOperations.ConfigureMMap. -func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - a.mu.Lock() - defer a.mu.Unlock() - if a.size == 0 { - return syserror.EINVAL - } - - if !a.perms.SupersetOf(opts.Perms) { - return syserror.EPERM - } - opts.MaxPerms = opts.MaxPerms.Intersect(a.perms) - - if a.tmpfsFile == nil { - tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}) - tmpfsInode := fs.NewInode(ctx, tmpfsInodeOps, fs.NewPseudoMountSource(ctx), fs.StableAttr{}) - dirent := fs.NewDirent(ctx, tmpfsInode, namePrefix+"/"+a.name) - tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}) - // Drop the extra reference on the Dirent. - dirent.DecRef() - - if err != nil { - return err - } - - // Truncate to the size set by ASHMEM_SET_SIZE ioctl. - err = tmpfsInodeOps.Truncate(ctx, tmpfsInode, int64(a.size)) - if err != nil { - return err - } - a.tmpfsFile = tmpfsFile - a.pb = NewPinBoard() - } - - return a.tmpfsFile.ConfigureMMap(ctx, opts) -} - -// Ioctl implements fs.FileOperations.Ioctl. -func (a *Area) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // Switch on ioctl request. - switch args[1].Uint() { - case linux.AshmemSetNameIoctl: - name, err := usermem.CopyStringIn(ctx, io, args[2].Pointer(), nameLen-1, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, err - } - - a.mu.Lock() - defer a.mu.Unlock() - - // Cannot set name for already mapped ashmem. - if a.tmpfsFile != nil { - return 0, syserror.EINVAL - } - a.name = name - return 0, nil - - case linux.AshmemGetNameIoctl: - a.mu.Lock() - var local []byte - if a.name != "" { - nameLen := len([]byte(a.name)) - local = make([]byte, nameLen, nameLen+1) - copy(local, []byte(a.name)) - local = append(local, 0) - } else { - nameLen := len([]byte(namePrefix)) - local = make([]byte, nameLen, nameLen+1) - copy(local, []byte(namePrefix)) - local = append(local, 0) - } - a.mu.Unlock() - - if _, err := io.CopyOut(ctx, args[2].Pointer(), local, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, syserror.EFAULT - } - return 0, nil - - case linux.AshmemSetSizeIoctl: - a.mu.Lock() - defer a.mu.Unlock() - - // Cannot set size for already mapped ashmem. - if a.tmpfsFile != nil { - return 0, syserror.EINVAL - } - a.size = uint64(args[2].SizeT()) - return 0, nil - - case linux.AshmemGetSizeIoctl: - return uintptr(a.size), nil - - case linux.AshmemPinIoctl, linux.AshmemUnpinIoctl, linux.AshmemGetPinStatusIoctl: - // Locking and unlocking is ok since once tmpfsFile is set, it won't be nil again - // even after unmapping! Unlocking is needed in order to avoid a deadlock on - // usermem.CopyObjectIn. - - // Cannot execute pin-related ioctls before mapping. - a.mu.Lock() - if a.tmpfsFile == nil { - a.mu.Unlock() - return 0, syserror.EINVAL - } - a.mu.Unlock() - - var pin linux.AshmemPin - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pin, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, syserror.EFAULT - } - - a.mu.Lock() - defer a.mu.Unlock() - return a.pinOperation(pin, args[1].Uint()) - - case linux.AshmemPurgeAllCachesIoctl: - return 0, nil - - case linux.AshmemSetProtMaskIoctl: - prot := uint64(args[2].ModeT()) - perms := usermem.AccessType{ - Read: prot&linux.PROT_READ != 0, - Write: prot&linux.PROT_WRITE != 0, - Execute: prot&linux.PROT_EXEC != 0, - } - - a.mu.Lock() - defer a.mu.Unlock() - - // Can only narrow prot mask. - if !a.perms.SupersetOf(perms) { - return 0, syserror.EINVAL - } - - // TODO(b/30946773,gvisor.dev/issue/153): If personality flag - // READ_IMPLIES_EXEC is set, set PROT_EXEC if PORT_READ is set. - - a.perms = perms - return 0, nil - - case linux.AshmemGetProtMaskIoctl: - return uintptr(a.perms.Prot()), nil - default: - // Ioctls irrelevant to Ashmem. - return 0, syserror.EINVAL - } -} - -// pinOperation should only be called while holding a.mu. -func (a *Area) pinOperation(pin linux.AshmemPin, op uint32) (uintptr, error) { - // Page-align a.size for checks. - pageAlignedSize, ok := usermem.Addr(a.size).RoundUp() - if !ok { - return 0, syserror.EINVAL - } - // Len 0 means everything onward. - if pin.Len == 0 { - pin.Len = uint32(pageAlignedSize) - pin.Offset - } - // Both Offset and Len have to be page-aligned. - if pin.Offset%uint32(usermem.PageSize) != 0 { - return 0, syserror.EINVAL - } - if pin.Len%uint32(usermem.PageSize) != 0 { - return 0, syserror.EINVAL - } - // Adding Offset and Len must not cause an uint32 overflow. - if end := pin.Offset + pin.Len; end < pin.Offset { - return 0, syserror.EINVAL - } - // Pin range must not exceed a's size. - if uint32(pageAlignedSize) < pin.Offset+pin.Len { - return 0, syserror.EINVAL - } - // Handle each operation. - r := RangeFromAshmemPin(pin) - switch op { - case linux.AshmemPinIoctl: - if a.pb.PinRange(r) { - return linux.AshmemWasPurged, nil - } - return linux.AshmemNotPurged, nil - - case linux.AshmemUnpinIoctl: - // TODO(b/30946773): Implement purge on unpin. - a.pb.UnpinRange(r) - return 0, nil - - case linux.AshmemGetPinStatusIoctl: - if a.pb.RangePinnedStatus(r) { - return linux.AshmemIsPinned, nil - } - return linux.AshmemIsUnpinned, nil - - default: - panic("unreachable") - } - -} diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go deleted file mode 100644 index 776f54abe..000000000 --- a/pkg/sentry/fs/ashmem/device.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ashmem implements Android ashmem module (Anonymus Shared Memory). -package ashmem - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -// Device implements fs.InodeOperations. -// -// +stateify savable -type Device struct { - fsutil.InodeGenericChecker `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.InodeNoopAllocate `state:"nosave"` - fsutil.InodeNoopRelease `state:"nosave"` - fsutil.InodeNoopTruncate `state:"nosave"` - fsutil.InodeNoopWriteOut `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotMappable `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeVirtual `state:"nosave"` - - fsutil.InodeSimpleAttributes -} - -var _ fs.InodeOperations = (*Device)(nil) - -// NewDevice creates and initializes a Device structure. -func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { - return &Device{ - InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, linux.ANON_INODE_FS_MAGIC), - } -} - -// GetFile implements fs.InodeOperations.GetFile. -func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fs.NewFile(ctx, d, flags, &Area{ - ad: ad, - tmpfsFile: nil, - perms: usermem.AnyAccess, - }), nil -} diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go deleted file mode 100644 index c5400dd94..000000000 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ashmem - -import "gvisor.dev/gvisor/pkg/abi/linux" - -const maxUint64 = ^uint64(0) - -// setFunctions implements segment.Functions generated from segment.Functions for -// uint64 Key and noValue Value. For more information, see the build file and -// segment set implementation at pkg/segment/set.go. -type setFunctions struct{} - -// noValue is a type of range attached value, which is irrelevant here. -type noValue struct{} - -// MinKey implements segment.Functions.MinKey. -func (setFunctions) MinKey() uint64 { - return 0 -} - -// MaxKey implements segment.Functions.MaxKey. -func (setFunctions) MaxKey() uint64 { - return maxUint64 -} - -// ClearValue implements segment.Functions.ClearValue. -func (setFunctions) ClearValue(*noValue) { - return -} - -// Merge implements segment.Functions.Merge. -func (setFunctions) Merge(Range, noValue, Range, noValue) (noValue, bool) { - return noValue{}, true -} - -// Split implements segment.Functions.Split. -func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) { - return noValue{}, noValue{} -} - -// PinBoard represents a set of pinned ranges in ashmem. -// -// segment.Set is used for implementation where segments represent -// ranges of pinned bytes, while gaps represent ranges of unpinned -// bytes. All ranges are page-aligned. -// -// +stateify savable -type PinBoard struct { - Set -} - -// NewPinBoard creates a new pin board with all pages pinned. -func NewPinBoard() *PinBoard { - var pb PinBoard - pb.PinRange(Range{0, maxUint64}) - return &pb -} - -// PinRange pins all pages in the specified range and returns true -// if there are any newly pinned pages. -func (pb *PinBoard) PinRange(r Range) bool { - pinnedPages := false - for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; { - common := gap.Range().Intersect(r) - if common.Length() == 0 { - gap = gap.NextGap() - continue - } - pinnedPages = true - gap = pb.Insert(gap, common, noValue{}).NextGap() - } - return pinnedPages -} - -// UnpinRange unpins all pages in the specified range. -func (pb *PinBoard) UnpinRange(r Range) { - for seg := pb.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; { - common := seg.Range().Intersect(r) - if common.Length() == 0 { - seg = seg.NextSegment() - continue - } - seg = pb.RemoveRange(common).NextSegment() - } -} - -// RangePinnedStatus returns false if there's at least one unpinned page in the -// specified range. -func (pb *PinBoard) RangePinnedStatus(r Range) bool { - for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; { - common := gap.Range().Intersect(r) - if common.Length() == 0 { - gap = gap.NextGap() - continue - } - return false - } - return true -} - -// RangeFromAshmemPin converts ashmem's original pin structure -// to Range. -func RangeFromAshmemPin(ap linux.AshmemPin) Range { - if ap.Len == 0 { - return Range{ - uint64(ap.Offset), - maxUint64, - } - } - return Range{ - uint64(ap.Offset), - uint64(ap.Offset) + uint64(ap.Len), - } -} diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go deleted file mode 100644 index 1ccd38f56..000000000 --- a/pkg/sentry/fs/ashmem/pin_board_test.go +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ashmem - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -func TestPinBoard(t *testing.T) { - pb := NewPinBoard() - - // Confirm that all pages are pinned. - if !pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { - t.Errorf("RangePinnedStatus(all pages) returned false (unpinned) at start.") - } - - // Unpin pages [1, 11) (counting from 0) - pb.UnpinRange(RangeFromAshmemPin(linux.AshmemPin{ - usermem.PageSize, - usermem.PageSize * 10, - })) - - // Confirm that pages [1, 11) are unpinned and that page 0 and pages - // larger than 10 are pinned. - pinned := []linux.AshmemPin{ - { - 0, - usermem.PageSize, - }, { - usermem.PageSize * 11, - 0, - }, - } - - for _, pin := range pinned { - if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", - pin.Offset, pin.Len) - } - } - - unpinned := []linux.AshmemPin{ - { - usermem.PageSize, - usermem.PageSize * 10, - }, - } - - for _, pin := range unpinned { - if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", - pin.Offset, pin.Len) - } - } - - // Pin pages [2, 6). - pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{ - usermem.PageSize * 2, - usermem.PageSize * 4, - })) - - // Confirm that pages 0, [2, 6) and pages larger than 10 are pinned - // while others remain unpinned. - pinned = []linux.AshmemPin{ - { - 0, - usermem.PageSize, - }, - { - usermem.PageSize * 2, - usermem.PageSize * 4, - }, - { - usermem.PageSize * 11, - 0, - }, - } - - for _, pin := range pinned { - if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", - pin.Offset, pin.Len) - } - } - - unpinned = []linux.AshmemPin{ - { - usermem.PageSize, - usermem.PageSize, - }, { - usermem.PageSize * 6, - usermem.PageSize * 5, - }, - } - - for _, pin := range unpinned { - if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { - t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", - pin.Offset, pin.Len) - } - } - - // Status of a partially pinned range is unpinned. - if pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { - t.Errorf("RangePinnedStatus(all pages) returned true (pinned).") - } - - // Pin the whole range again. - pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{0, 0})) - - // Confirm that all pages are pinned. - if !pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { - t.Errorf("RangePinnedStatus(all pages) returned false (unpinned) at start.") - } -} diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD deleted file mode 100644 index 639a5603a..000000000 --- a/pkg/sentry/fs/binder/BUILD +++ /dev/null @@ -1,27 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "binder", - srcs = [ - "binder.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/binder", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/fs", - "//pkg/sentry/fs/fsutil", - "//pkg/sentry/kernel", - "//pkg/sentry/memmap", - "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", - "//pkg/sentry/usage", - "//pkg/sentry/usermem", - "//pkg/syserror", - "//pkg/waiter", - ], -) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go deleted file mode 100644 index eef54d787..000000000 --- a/pkg/sentry/fs/binder/binder.go +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package binder implements Android Binder IPC module. -package binder - -import ( - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/waiter" -) - -const ( - currentProtocolVersion = 8 - - // mmapSizeLimit is the upper limit for mapped memory size in Binder. - mmapSizeLimit = 4 * 1024 * 1024 // 4MB -) - -// Device implements fs.InodeOperations. -// -// +stateify savable -type Device struct { - fsutil.InodeGenericChecker `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.InodeNoopAllocate `state:"nosave"` - fsutil.InodeNoopRelease `state:"nosave"` - fsutil.InodeNoopTruncate `state:"nosave"` - fsutil.InodeNoopWriteOut `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotMappable `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeVirtual `state:"nosave"` - - fsutil.InodeSimpleAttributes -} - -var _ fs.InodeOperations = (*Device)(nil) - -// NewDevice creates and initializes a Device structure. -func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { - return &Device{ - InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, 0), - } -} - -// GetFile implements fs.InodeOperations.GetFile. -// -// TODO(b/30946773): Add functionality to GetFile: Additional fields will be -// needed in the Device structure, initialize them here. Also, Device will need -// to keep track of the created Procs in order to implement BINDER_READ_WRITE -// ioctl. -func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fs.NewFile(ctx, d, flags, &Proc{ - bd: bd, - task: kernel.TaskFromContext(ctx), - mfp: pgalloc.MemoryFileProviderFromContext(ctx), - }), nil -} - -// Proc implements fs.FileOperations and fs.IoctlGetter. -// -// +stateify savable -type Proc struct { - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` - - bd *Device - task *kernel.Task - mfp pgalloc.MemoryFileProvider - - // mu protects fr. - mu sync.Mutex `state:"nosave"` - - // mapped is memory allocated from mfp.MemoryFile() by AddMapping. - mapped platform.FileRange -} - -// Release implements fs.FileOperations.Release. -func (bp *Proc) Release() { - bp.mu.Lock() - defer bp.mu.Unlock() - if bp.mapped.Length() != 0 { - bp.mfp.MemoryFile().DecRef(bp.mapped) - } -} - -// Seek implements fs.FileOperations.Seek. -// -// Binder doesn't support seek operation (unless in debug mode). -func (bp *Proc) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { - return offset, syserror.EOPNOTSUPP -} - -// Read implements fs.FileOperations.Read. -// -// Binder doesn't support read operation (unless in debug mode). -func (bp *Proc) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - return 0, syserror.EOPNOTSUPP -} - -// Write implements fs.FileOperations.Write. -// -// Binder doesn't support write operation. -func (bp *Proc) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - return 0, syserror.EOPNOTSUPP -} - -// Flush implements fs.FileOperations.Flush. -// -// TODO(b/30946773): Implement. -func (bp *Proc) Flush(ctx context.Context, file *fs.File) error { - return nil -} - -// ConfigureMMap implements fs.FileOperations.ConfigureMMap. -func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - // Compare drivers/android/binder.c:binder_mmap(). - if caller := kernel.TaskFromContext(ctx); caller != bp.task { - return syserror.EINVAL - } - if opts.Length > mmapSizeLimit { - opts.Length = mmapSizeLimit - } - opts.MaxPerms.Write = false - - // TODO(b/30946773): Binder sets VM_DONTCOPY, preventing the created vma - // from being copied across fork(), but we don't support this yet. As - // a result, MMs containing a Binder mapping cannot be forked (MM.Fork will - // fail when AddMapping returns EBUSY). - - return fsutil.GenericConfigureMMap(file, bp, opts) -} - -// Ioctl implements fs.FileOperations.Ioctl. -// -// TODO(b/30946773): Implement. -func (bp *Proc) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // Switch on ioctl request. - switch uint32(args[1].Int()) { - case linux.BinderVersionIoctl: - ver := &linux.BinderVersion{ - ProtocolVersion: currentProtocolVersion, - } - // Copy result to user-space. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ver, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - case linux.BinderWriteReadIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetIdleTimeoutIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetMaxThreadsIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetIdlePriorityIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderSetContextMgrIoctl: - // TODO(b/30946773): Implement. - fallthrough - case linux.BinderThreadExitIoctl: - // TODO(b/30946773): Implement. - return 0, syserror.ENOSYS - default: - // Ioctls irrelevant to Binder. - return 0, syserror.EINVAL - } -} - -// AddMapping implements memmap.Mappable.AddMapping. -func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error { - bp.mu.Lock() - defer bp.mu.Unlock() - if bp.mapped.Length() != 0 { - // mmap has been called before, which binder_mmap() doesn't like. - return syserror.EBUSY - } - // Binder only allocates and maps a single page up-front - // (drivers/android/binder.c:binder_mmap() => binder_update_page_range()). - fr, err := bp.mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) - if err != nil { - return err - } - bp.mapped = fr - return nil -} - -// RemoveMapping implements memmap.Mappable.RemoveMapping. -func (*Proc) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { - // Nothing to do. Notably, we don't free bp.mapped to allow another mmap. -} - -// CopyMapping implements memmap.Mappable.CopyMapping. -func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error { - // Nothing to do. Notably, this is one case where CopyMapping isn't - // equivalent to AddMapping, as AddMapping would return EBUSY. - return nil -} - -// Translate implements memmap.Mappable.Translate. -func (bp *Proc) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { - // TODO(b/30946773): In addition to the page initially allocated and mapped - // in AddMapping (Linux: binder_mmap), Binder allocates and maps pages for - // each transaction (Linux: binder_ioctl => binder_ioctl_write_read => - // binder_thread_write => binder_transaction => binder_alloc_buf => - // binder_update_page_range). Since we don't actually implement - // BinderWriteReadIoctl (Linux: BINDER_WRITE_READ), we only ever have the - // first page. - var err error - if required.End > usermem.PageSize { - err = &memmap.BusError{syserror.EFAULT} - } - if required.Start == 0 { - return []memmap.Translation{ - { - Source: memmap.MappableRange{0, usermem.PageSize}, - File: bp.mfp.MemoryFile(), - Offset: bp.mapped.Start, - Perms: usermem.AnyAccess, - }, - }, err - } - return nil, err -} - -// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. -func (bp *Proc) InvalidateUnsavable(ctx context.Context) error { - return nil -} diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index a9b03d172..59de615fb 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -20,8 +20,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", - "//pkg/sentry/fs/ashmem", - "//pkg/sentry/fs/binder", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index fb6c30ff0..d4bbd9807 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -20,8 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/ashmem" - "gvisor.dev/gvisor/pkg/sentry/fs/binder" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -81,7 +79,7 @@ func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.In } // New returns the root node of a device filesystem. -func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEnabled bool) *fs.Inode { +func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { contents := map[string]*fs.Inode{ "fd": newSymlink(ctx, "/proc/self/fd", msrc), "stdin": newSymlink(ctx, "/proc/self/fd/0", msrc), @@ -118,16 +116,6 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn "ptmx": newSymlink(ctx, "pts/ptmx", msrc), } - if binderEnabled { - binder := binder.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) - contents["binder"] = newCharacterDevice(ctx, binder, msrc) - } - - if ashmemEnabled { - ashmem := ashmem.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) - contents["ashmem"] = newCharacterDevice(ctx, ashmem, msrc) - } - iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(ctx, iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index cbc2c2f9b..55f8af704 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -15,19 +15,10 @@ package dev import ( - "strconv" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) -// Optional key containing boolean flag which specifies if Android Binder IPC should be enabled. -const binderEnabledKey = "binder_enabled" - -// Optional key containing boolean flag which specifies if Android ashmem should be enabled. -const ashmemEnabledKey = "ashmem_enabled" - // filesystem is a devtmpfs. // // +stateify savable @@ -67,33 +58,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { // Mount returns a devtmpfs root that can be positioned in the vfs. func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { - // device is always ignored. // devtmpfs backed by ramfs ignores bad options. See fs/ramfs/inode.c:ramfs_parse_options. // -> we should consider parsing the mode and backing devtmpfs by this. - - // Parse generic comma-separated key=value options. - options := fs.GenericMountSourceOptions(data) - - // binerEnabledKey is optional and binder is disabled by default. - binderEnabled := false - if beStr, exists := options[binderEnabledKey]; exists { - var err error - binderEnabled, err = strconv.ParseBool(beStr) - if err != nil { - return nil, syserror.EINVAL - } - } - - // ashmemEnabledKey is optional and ashmem is disabled by default. - ashmemEnabled := false - if aeStr, exists := options[ashmemEnabledKey]; exists { - var err error - ashmemEnabled, err = strconv.ParseBool(aeStr) - if err != nil { - return nil, syserror.EINVAL - } - } - - // Construct the devtmpfs root. - return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags), binderEnabled, ashmemEnabled), nil + return New(ctx, fs.NewNonCachingMountSource(ctx, f, flags)), nil } -- cgit v1.2.3 From 45566fa4e4bc55e870ae1d7f80778be8432bdef5 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 28 Jun 2019 20:06:33 -0700 Subject: Add finalizer on AtomicRefCount to check for leaks. PiperOrigin-RevId: 255711454 --- pkg/refs/BUILD | 3 + pkg/refs/refcounter.go | 69 ++++++++++++++++++++++ pkg/sentry/fs/dirent.go | 4 +- pkg/sentry/fs/file.go | 5 +- pkg/sentry/fs/file_overlay.go | 5 +- pkg/sentry/fs/gofer/handles.go | 5 +- pkg/sentry/fs/gofer/path.go | 7 ++- pkg/sentry/fs/gofer/session.go | 7 ++- pkg/sentry/fs/gofer/session_state.go | 1 - pkg/sentry/fs/host/socket.go | 8 ++- pkg/sentry/fs/inode.go | 4 +- pkg/sentry/fs/mount.go | 4 +- pkg/sentry/fs/mounts.go | 6 +- pkg/sentry/fs/tty/terminal.go | 4 +- pkg/sentry/kernel/fd_map.go | 4 +- pkg/sentry/kernel/fs_context.go | 4 +- pkg/sentry/kernel/sessions.go | 16 +++-- pkg/sentry/kernel/shm/shm.go | 1 + pkg/sentry/mm/aio_context.go | 4 +- pkg/sentry/mm/special_mappable.go | 4 +- pkg/sentry/socket/unix/transport/connectioned.go | 4 ++ pkg/sentry/socket/unix/transport/connectionless.go | 4 +- pkg/sentry/socket/unix/unix.go | 7 ++- runsc/boot/BUILD | 1 + runsc/boot/loader.go | 6 ++ 25 files changed, 152 insertions(+), 35 deletions(-) diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 0652fbbe6..9c08452fc 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -24,6 +24,9 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/refs", visibility = ["//:sandbox"], + deps = [ + "//pkg/log", + ], ) go_test( diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 20f515391..6d6c3a782 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -18,8 +18,11 @@ package refs import ( "reflect" + "runtime" "sync" "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" ) // RefCounter is the interface to be implemented by objects that are reference @@ -189,6 +192,11 @@ type AtomicRefCount struct { // how these fields are used. refCount int64 + // name is the name of the type which owns this ref count. + // + // name is immutable after EnableLeakCheck is called. + name string + // mu protects the list below. mu sync.Mutex `state:"nosave"` @@ -196,6 +204,67 @@ type AtomicRefCount struct { weakRefs weakRefList `state:"nosave"` } +// LeakMode configures the leak checker. +type LeakMode uint32 + +const ( + // uninitializedLeakChecking indicates that the leak checker has not yet been initialized. + uninitializedLeakChecking LeakMode = iota + + // NoLeakChecking indicates that no effort should be made to check for + // leaks. + NoLeakChecking + + // LeaksLogWarning indicates that a warning should be logged when leaks + // are found. + LeaksLogWarning +) + +// leakMode stores the current mode for the reference leak checker. +// +// Values must be one of the LeakMode values. +// +// leakMode must be accessed atomically. +var leakMode uint32 + +// SetLeakMode configures the reference leak checker. +func SetLeakMode(mode LeakMode) { + atomic.StoreUint32(&leakMode, uint32(mode)) +} + +func (r *AtomicRefCount) finalize() { + var note string + switch LeakMode(atomic.LoadUint32(&leakMode)) { + case NoLeakChecking: + return + case uninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sAtomicRefCount %p owned by %q garbage collected with ref count of %d (want 0)", note, r, r.name, n) + } +} + +// EnableLeakCheck checks for reference leaks when the AtomicRefCount gets +// garbage collected. +// +// This function adds a finalizer to the AtomicRefCount, so the AtomicRefCount +// must be at the beginning of its parent. +// +// name is a friendly name that will be listed as the owner of the +// AtomicRefCount in logs. It should be the name of the parent type, including +// package. +func (r *AtomicRefCount) EnableLeakCheck(name string) { + if name == "" { + panic("invalid name") + } + if LeakMode(atomic.LoadUint32(&leakMode)) == NoLeakChecking { + return + } + r.name = name + runtime.SetFinalizer(r, (*AtomicRefCount).finalize) +} + // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *AtomicRefCount) ReadRefs() int64 { diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 28651e58b..fbca06761 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -229,11 +229,13 @@ func newDirent(inode *Inode, name string) *Dirent { if inode != nil { inode.MountSource.IncDirentRefs() } - return &Dirent{ + d := Dirent{ Inode: inode, name: name, children: make(map[string]*refs.WeakRef), } + d.EnableLeakCheck("fs.Dirent") + return &d } // NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8e1f5674d..bb8117f89 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -130,14 +130,15 @@ type File struct { // to false respectively. func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File { dirent.IncRef() - f := &File{ + f := File{ UniqueID: uniqueid.GlobalFromContext(ctx), Dirent: dirent, FileOperations: fops, flags: flags, } f.mu.Init() - return f + f.EnableLeakCheck("fs.File") + return &f } // DecRef destroys the File when it is no longer referenced. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index c6cbd5631..9820f0b13 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -347,13 +347,14 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt // preventing us from saving a proper inode mapping for the // file. file.IncRef() - id := &overlayMappingIdentity{ + id := overlayMappingIdentity{ id: opts.MappingIdentity, overlayFile: file, } + id.EnableLeakCheck("fs.overlayMappingIdentity") // Swap out the old MappingIdentity for the wrapped one. - opts.MappingIdentity = id + opts.MappingIdentity = &id return nil } diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index b87c4f150..27eeae3d9 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -79,11 +79,12 @@ func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*han newFile.close(ctx) return nil, err } - h := &handles{ + h := handles{ File: newFile, Host: hostFile, } - return h, nil + h.EnableLeakCheck("gofer.handles") + return &h, nil } type handleReadWriter struct { diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index b91386909..8c17603f8 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -145,16 +145,17 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string defer d.DecRef() // Construct the new file, caching the handles if allowed. - h := &handles{ + h := handles{ File: newFile, Host: hostFile, } + h.EnableLeakCheck("gofer.handles") if iops.fileState.canShareHandles() { iops.fileState.handlesMu.Lock() - iops.fileState.setSharedHandlesLocked(flags, h) + iops.fileState.setSharedHandlesLocked(flags, &h) iops.fileState.handlesMu.Unlock() } - return NewFile(ctx, d, name, flags, iops, h), nil + return NewFile(ctx, d, name, flags, iops, &h), nil } // CreateLink uses Create to create a symlink between oldname and newname. diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 9f7660ed1..69d08a627 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -241,7 +241,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF } // Construct the session. - s := &session{ + s := session{ connID: dev, msize: o.msize, version: o.version, @@ -250,13 +250,14 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF superBlockFlags: superBlockFlags, mounter: mounter, } + s.EnableLeakCheck("gofer.session") if o.privateunixsocket { s.endpoints = newEndpointMaps() } // Construct the MountSource with the session and superBlockFlags. - m := fs.NewMountSource(ctx, s, filesystem, superBlockFlags) + m := fs.NewMountSource(ctx, &s, filesystem, superBlockFlags) // Given that gofer files can consume host FDs, restrict the number // of files that can be held by the cache. @@ -290,7 +291,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF return nil, err } - sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr, false) + sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false) return fs.NewInode(ctx, iops, m, sattr), nil } diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 29a79441e..d045e04ff 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -111,5 +111,4 @@ func (s *session) afterLoad() { panic("failed to restore endpoint maps: " + err.Error()) } } - } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 7fedc88bc..44c4ee5f2 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -47,12 +47,12 @@ const maxSendBufferSize = 8 << 20 // // +stateify savable type ConnectedEndpoint struct { - queue *waiter.Queue - path string - // ref keeps track of references to a connectedEndpoint. ref refs.AtomicRefCount + queue *waiter.Queue + path string + // If srfd >= 0, it is the host FD that file was imported from. srfd int `state:"wait"` @@ -133,6 +133,8 @@ func NewConnectedEndpoint(ctx context.Context, file *fd.FD, queue *waiter.Queue, // AtomicRefCounters start off with a single reference. We need two. e.ref.IncRef() + e.ref.EnableLeakCheck("host.ConnectedEndpoint") + return &e, nil } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index e4aae1135..f4ddfa406 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -86,12 +86,14 @@ type LockCtx struct { // NewInode takes a reference on msrc. func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, sattr StableAttr) *Inode { msrc.IncRef() - return &Inode{ + i := Inode{ InodeOperations: iops, StableAttr: sattr, Watches: newWatches(), MountSource: msrc, } + i.EnableLeakCheck("fs.Inode") + return &i } // DecRef drops a reference on the Inode. diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 912495528..7a9692800 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -138,12 +138,14 @@ func NewMountSource(ctx context.Context, mops MountSourceOperations, filesystem if filesystem != nil { fsType = filesystem.Name() } - return &MountSource{ + msrc := MountSource{ MountSourceOperations: mops, Flags: flags, FilesystemType: fsType, fscache: NewDirentCache(DefaultDirentCacheSize), } + msrc.EnableLeakCheck("fs.MountSource") + return &msrc } // DirentRefs returns the current mount direntRefs. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 281364dfc..ce7ffeed2 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -181,12 +181,14 @@ func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error d: newRootMount(1, d), } - return &MountNamespace{ + mns := MountNamespace{ userns: creds.UserNamespace, root: d, mounts: mnts, mountID: 2, - }, nil + } + mns.EnableLeakCheck("fs.MountNamespace") + return &mns, nil } // UserNamespace returns the user namespace associated with this mount manager. diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 8290f2530..b7cecb2ed 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -38,9 +38,11 @@ type Terminal struct { func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { termios := linux.DefaultSlaveTermios - return &Terminal{ + t := Terminal{ d: d, n: n, ld: newLineDiscipline(termios), } + t.EnableLeakCheck("tty.Terminal") + return &t } diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index 786936a7d..1b84bfe14 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -98,11 +98,13 @@ func (f *FDMap) ID() uint64 { // NewFDMap allocates a new FDMap that may be used by tasks in k. func (k *Kernel) NewFDMap() *FDMap { - return &FDMap{ + f := FDMap{ k: k, files: make(map[kdefs.FD]descriptor), uid: atomic.AddUint64(&k.fdMapUids, 1), } + f.EnableLeakCheck("kernel.FDMap") + return &f } // destroy removes all of the file descriptors from the map. diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 938239aeb..ded27d668 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -51,11 +51,13 @@ type FSContext struct { func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { root.IncRef() cwd.IncRef() - return &FSContext{ + f := FSContext{ root: root, cwd: cwd, umask: umask, } + f.EnableLeakCheck("kernel.FSContext") + return &f } // destroy is the destructor for an FSContext. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 355984140..81fcd8258 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -294,6 +294,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } + s.refs.EnableLeakCheck("kernel.Session") // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -307,6 +308,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") // Tie them and return the result. s.processGroups.PushBack(pg) @@ -378,11 +380,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // We manually adjust the ancestors if the parent is in the same // session. tg.processGroup.session.incRef() - pg := &ProcessGroup{ + pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ } @@ -390,20 +394,20 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // Assign the new process group; adjust children. oldParentPG := tg.parentPG() tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { - childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.incRefWithParent(&pg) childTG.processGroup.decRefWithParent(oldParentPG) }) tg.processGroup.decRefWithParent(oldParentPG) - tg.processGroup = pg + tg.processGroup = &pg // Add the new process group to the session. - pg.session.processGroups.PushBack(pg) + pg.session.processGroups.PushBack(&pg) // Ensure this translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { local := ns.tgids[tg] - ns.pgids[pg] = ProcessGroupID(local) - ns.processGroups[ProcessGroupID(local)] = pg + ns.pgids[&pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = &pg } return nil diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 3e9fe70e2..5bd610f68 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -224,6 +224,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } + shm.EnableLeakCheck("kernel.Shm") // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index c9a942023..1b746d030 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -213,7 +213,9 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { if err != nil { return nil, err } - return &aioMappable{mfp: mfp, fr: fr}, nil + m := aioMappable{mfp: mfp, fr: fr} + m.EnableLeakCheck("mm.aioMappable") + return &m, nil } // DecRef implements refs.RefCounter.DecRef. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 2f8651a0a..ea2d7af74 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -45,7 +45,9 @@ type SpecialMappable struct { // // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { - return &SpecialMappable{mfp: mfp, fr: fr, name: name} + m := SpecialMappable{mfp: mfp, fr: fr, name: name} + m.EnableLeakCheck("mm.SpecialMappable") + return &m } // DecRef implements refs.RefCounter.DecRef. diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index e4c416233..73d2df15d 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -143,7 +143,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E } q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} + q1.EnableLeakCheck("transport.queue") q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} + q2.EnableLeakCheck("transport.queue") if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -294,12 +296,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn } readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} + readQueue.EnableLeakCheck("transport.queue") ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} + writeQueue.EnableLeakCheck("transport.queue") if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index e987519f0..c591f261d 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -41,7 +41,9 @@ var ( // NewConnectionless creates a new unbound dgram endpoint. func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} - ep.receiver = &queueReceiver{readQueue: &queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}} + q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} + q.EnableLeakCheck("transport.queue") + ep.receiver = &queueReceiver{readQueue: &q} return ep } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 6190de0c5..bf7d2cfa2 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -69,10 +69,13 @@ func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) // NewWithDirent creates a new unix socket using an existing dirent. func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File { - return fs.NewFile(ctx, d, flags, &SocketOperations{ + s := SocketOperations{ ep: ep, stype: stype, - }) + } + s.EnableLeakCheck("unix.SocketOperations") + + return fs.NewFile(ctx, d, flags, &s) } // DecRef implements RefCounter.DecRef. diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 7ec15a524..d91f66d95 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/log", "//pkg/memutil", "//pkg/rand", + "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 41027416d..89f7d9f94 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs/host" @@ -1006,3 +1007,8 @@ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host } return ep.tg, ep.tty, nil } + +func init() { + // TODO(gvisor.dev/issue/365): Make this configurable. + refs.SetLeakMode(refs.NoLeakChecking) +} -- cgit v1.2.3 From 6d204f6a3474f411dc39b59886973e3518cf1cec Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 28 Jun 2019 20:34:03 -0700 Subject: Drop local_server support. PiperOrigin-RevId: 255713414 --- pkg/p9/local_server/BUILD | 14 -- pkg/p9/local_server/local_server.go | 353 ------------------------------------ 2 files changed, 367 deletions(-) delete mode 100644 pkg/p9/local_server/BUILD delete mode 100644 pkg/p9/local_server/local_server.go diff --git a/pkg/p9/local_server/BUILD b/pkg/p9/local_server/BUILD deleted file mode 100644 index aa6db186c..000000000 --- a/pkg/p9/local_server/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") - -package(licenses = ["notice"]) - -go_binary( - name = "local_server", - srcs = ["local_server.go"], - deps = [ - "//pkg/fd", - "//pkg/log", - "//pkg/p9", - "//pkg/unet", - ], -) diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go deleted file mode 100644 index 905188aaa..000000000 --- a/pkg/p9/local_server/local_server.go +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary local_server provides a local 9P2000.L server for the p9 package. -// -// To use, first start the server: -// local_server /tmp/my_bind_addr -// -// Then, connect using the Linux 9P filesystem: -// mount -t 9p -o trans=unix /tmp/my_bind_addr /mnt -// -// This package also serves as an examplar. -package main - -import ( - "os" - "path" - "syscall" - - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/unet" -) - -// local wraps a local file. -type local struct { - p9.DefaultWalkGetAttr - - path string - file *os.File -} - -// info constructs a QID for this file. -func (l *local) info() (p9.QID, os.FileInfo, error) { - var ( - qid p9.QID - fi os.FileInfo - err error - ) - - // Stat the file. - if l.file != nil { - fi, err = l.file.Stat() - } else { - fi, err = os.Lstat(l.path) - } - if err != nil { - log.Warningf("error stating %#v: %v", l, err) - return qid, nil, err - } - - // Construct the QID type. - qid.Type = p9.ModeFromOS(fi.Mode()).QIDType() - - // Save the path from the Ino. - qid.Path = fi.Sys().(*syscall.Stat_t).Ino - return qid, fi, nil -} - -// Attach implements p9.Attacher.Attach. -func (l *local) Attach() (p9.File, error) { - return &local{path: "/"}, nil -} - -// Walk implements p9.File.Walk. -func (l *local) Walk(names []string) ([]p9.QID, p9.File, error) { - var qids []p9.QID - last := &local{path: l.path} - for _, name := range names { - c := &local{path: path.Join(last.path, name)} - qid, _, err := c.info() - if err != nil { - return nil, nil, err - } - qids = append(qids, qid) - last = c - } - return qids, last, nil -} - -// StatFS implements p9.File.StatFS. -// -// Not implemented. -func (l *local) StatFS() (p9.FSStat, error) { - return p9.FSStat{}, syscall.ENOSYS -} - -// FSync implements p9.File.FSync. -func (l *local) FSync() error { - return l.file.Sync() -} - -// GetAttr implements p9.File.GetAttr. -// -// Not fully implemented. -func (l *local) GetAttr(req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - qid, fi, err := l.info() - if err != nil { - return qid, p9.AttrMask{}, p9.Attr{}, err - } - - stat := fi.Sys().(*syscall.Stat_t) - attr := p9.Attr{ - Mode: p9.FileMode(stat.Mode), - UID: p9.UID(stat.Uid), - GID: p9.GID(stat.Gid), - NLink: stat.Nlink, - RDev: stat.Rdev, - Size: uint64(stat.Size), - BlockSize: uint64(stat.Blksize), - Blocks: uint64(stat.Blocks), - ATimeSeconds: uint64(stat.Atim.Sec), - ATimeNanoSeconds: uint64(stat.Atim.Nsec), - MTimeSeconds: uint64(stat.Mtim.Sec), - MTimeNanoSeconds: uint64(stat.Mtim.Nsec), - CTimeSeconds: uint64(stat.Ctim.Sec), - CTimeNanoSeconds: uint64(stat.Ctim.Nsec), - } - valid := p9.AttrMask{ - Mode: true, - UID: true, - GID: true, - NLink: true, - RDev: true, - Size: true, - Blocks: true, - ATime: true, - MTime: true, - CTime: true, - } - - return qid, valid, attr, nil -} - -// SetAttr implements p9.File.SetAttr. -// -// Not implemented. -func (l *local) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { - return syscall.ENOSYS -} - -// Remove implements p9.File.Remove. -// -// Not implemented. -func (l *local) Remove() error { - return syscall.ENOSYS -} - -// Rename implements p9.File.Rename. -// -// Not implemented. -func (l *local) Rename(directory p9.File, name string) error { - return syscall.ENOSYS -} - -// Close implements p9.File.Close. -func (l *local) Close() error { - if l.file != nil { - return l.file.Close() - } - return nil -} - -// Open implements p9.File.Open. -func (l *local) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - qid, _, err := l.info() - if err != nil { - return nil, qid, 0, err - } - - // Do the actual open. - f, err := os.OpenFile(l.path, int(mode), 0) - if err != nil { - return nil, qid, 0, err - } - l.file = f - - // Note: we don't send the local file for this server. - return nil, qid, 4096, nil -} - -// Read implements p9.File.Read. -func (l *local) ReadAt(p []byte, offset uint64) (int, error) { - return l.file.ReadAt(p, int64(offset)) -} - -// Write implements p9.File.Write. -func (l *local) WriteAt(p []byte, offset uint64) (int, error) { - return l.file.WriteAt(p, int64(offset)) -} - -// Create implements p9.File.Create. -func (l *local) Create(name string, mode p9.OpenFlags, permissions p9.FileMode, _ p9.UID, _ p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { - f, err := os.OpenFile(l.path, int(mode)|syscall.O_CREAT|syscall.O_EXCL, os.FileMode(permissions)) - if err != nil { - return nil, nil, p9.QID{}, 0, err - } - - l2 := &local{path: path.Join(l.path, name), file: f} - qid, _, err := l2.info() - if err != nil { - l2.Close() - return nil, nil, p9.QID{}, 0, err - } - - return nil, l2, qid, 4096, nil -} - -// Mkdir implements p9.File.Mkdir. -// -// Not properly implemented. -func (l *local) Mkdir(name string, permissions p9.FileMode, _ p9.UID, _ p9.GID) (p9.QID, error) { - if err := os.Mkdir(path.Join(l.path, name), os.FileMode(permissions)); err != nil { - return p9.QID{}, err - } - - // Blank QID. - return p9.QID{}, nil -} - -// Symlink implements p9.File.Symlink. -// -// Not properly implemented. -func (l *local) Symlink(oldname string, newname string, _ p9.UID, _ p9.GID) (p9.QID, error) { - if err := os.Symlink(oldname, path.Join(l.path, newname)); err != nil { - return p9.QID{}, err - } - - // Blank QID. - return p9.QID{}, nil -} - -// Link implements p9.File.Link. -// -// Not properly implemented. -func (l *local) Link(target p9.File, newname string) error { - return os.Link(target.(*local).path, path.Join(l.path, newname)) -} - -// Mknod implements p9.File.Mknod. -// -// Not implemented. -func (l *local) Mknod(name string, mode p9.FileMode, major uint32, minor uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { - return p9.QID{}, syscall.ENOSYS -} - -// RenameAt implements p9.File.RenameAt. -// -// Not implemented. -func (l *local) RenameAt(oldname string, newdir p9.File, newname string) error { - return syscall.ENOSYS -} - -// UnlinkAt implements p9.File.UnlinkAt. -// -// Not implemented. -func (l *local) UnlinkAt(name string, flags uint32) error { - return syscall.ENOSYS -} - -// Readdir implements p9.File.Readdir. -func (l *local) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { - // We only do *all* dirents in single shot. - const maxDirentBuffer = 1024 * 1024 - buf := make([]byte, maxDirentBuffer) - n, err := syscall.ReadDirent(int(l.file.Fd()), buf) - if err != nil { - // Return zero entries. - return nil, nil - } - - // Parse the entries; note that we read up to offset+count here. - _, newCount, newNames := syscall.ParseDirent(buf[:n], int(offset)+int(count), nil) - var dirents []p9.Dirent - for i := int(offset); i >= 0 && i < newCount; i++ { - entry := local{path: path.Join(l.path, newNames[i])} - qid, _, err := entry.info() - if err != nil { - continue - } - dirents = append(dirents, p9.Dirent{ - QID: qid, - Type: qid.Type, - Name: newNames[i], - Offset: uint64(i + 1), - }) - } - - return dirents, nil -} - -// Readlink implements p9.File.Readlink. -// -// Not properly implemented. -func (l *local) Readlink() (string, error) { - return os.Readlink(l.path) -} - -// Flush implements p9.File.Flush. -func (l *local) Flush() error { - return nil -} - -// Connect implements p9.File.Connect. -func (l *local) Connect(p9.ConnectFlags) (*fd.FD, error) { - return nil, syscall.ECONNREFUSED -} - -// Renamed implements p9.File.Renamed. -func (l *local) Renamed(parent p9.File, newName string) { - l.path = path.Join(parent.(*local).path, newName) -} - -// Allocate implements p9.File.Allocate. -func (l *local) Allocate(mode p9.AllocateMode, offset, length uint64) error { - return syscall.Fallocate(int(l.file.Fd()), mode.ToLinux(), int64(offset), int64(length)) -} - -func main() { - log.SetLevel(log.Debug) - - if len(os.Args) != 2 { - log.Warningf("usage: %s ", os.Args[0]) - os.Exit(1) - } - - // Bind and listen on the socket. - serverSocket, err := unet.BindAndListen(os.Args[1], false) - if err != nil { - log.Warningf("err binding: %v", err) - os.Exit(1) - } - - // Run the server. - s := p9.NewServer(&local{}) - s.Serve(serverSocket) -} - -var ( - _ p9.File = &local{} -) -- cgit v1.2.3 From 3446f4e29bd547e5576caf16d8c2bb45560439e9 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Sat, 29 Jun 2019 09:22:09 -0700 Subject: Add stack trace printing to reference leak checking. PiperOrigin-RevId: 255759891 --- pkg/refs/refcounter.go | 92 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 6d6c3a782..3dd6f0dd4 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -17,6 +17,8 @@ package refs import ( + "bytes" + "fmt" "reflect" "runtime" "sync" @@ -197,6 +199,11 @@ type AtomicRefCount struct { // name is immutable after EnableLeakCheck is called. name string + // stack optionally records the caller of EnableLeakCheck. + // + // stack is immutable after EnableLeakCheck is called. + stack []uintptr + // mu protects the list below. mu sync.Mutex `state:"nosave"` @@ -218,6 +225,10 @@ const ( // LeaksLogWarning indicates that a warning should be logged when leaks // are found. LeaksLogWarning + + // LeaksLogTraces indicates that a trace collected during allocation + // should be logged when leaks are found. + LeaksLogTraces ) // leakMode stores the current mode for the reference leak checker. @@ -232,6 +243,76 @@ func SetLeakMode(mode LeakMode) { atomic.StoreUint32(&leakMode, uint32(mode)) } +const maxStackFrames = 40 + +type fileLine struct { + file string + line int +} + +// A stackKey is a representation of a stack frame for use as a map key. +// +// The fileLine type is used as PC values seem to vary across collections, even +// for the same call stack. +type stackKey [maxStackFrames]fileLine + +var stackCache = struct { + sync.Mutex + entries map[stackKey][]uintptr +}{entries: map[stackKey][]uintptr{}} + +func makeStackKey(pcs []uintptr) stackKey { + frames := runtime.CallersFrames(pcs) + var key stackKey + keySlice := key[:0] + for { + frame, more := frames.Next() + keySlice = append(keySlice, fileLine{frame.File, frame.Line}) + + if !more || len(keySlice) == len(key) { + break + } + } + return key +} + +func recordStack() []uintptr { + pcs := make([]uintptr, maxStackFrames) + n := runtime.Callers(1, pcs) + if n == 0 { + // No pcs available. Stop now. + // + // This can happen if the first argument to runtime.Callers + // is large. + return nil + } + pcs = pcs[:n] + key := makeStackKey(pcs) + stackCache.Lock() + v, ok := stackCache.entries[key] + if !ok { + // Reallocate to prevent pcs from escaping. + v = append([]uintptr(nil), pcs...) + stackCache.entries[key] = v + } + stackCache.Unlock() + return v +} + +func formatStack(pcs []uintptr) string { + frames := runtime.CallersFrames(pcs) + var trace bytes.Buffer + for { + frame, more := frames.Next() + fmt.Fprintf(&trace, "%s:%d: %s\n", frame.File, frame.Line, frame.Function) + + if !more { + break + } + } + return trace.String() +} + func (r *AtomicRefCount) finalize() { var note string switch LeakMode(atomic.LoadUint32(&leakMode)) { @@ -241,7 +322,11 @@ func (r *AtomicRefCount) finalize() { note = "(Leak checker uninitialized): " } if n := r.ReadRefs(); n != 0 { - log.Warningf("%sAtomicRefCount %p owned by %q garbage collected with ref count of %d (want 0)", note, r, r.name, n) + msg := fmt.Sprintf("%sAtomicRefCount %p owned by %q garbage collected with ref count of %d (want 0)", note, r, r.name, n) + if len(r.stack) != 0 { + msg += ":\nCaller:\n" + formatStack(r.stack) + } + log.Warningf(msg) } } @@ -258,8 +343,11 @@ func (r *AtomicRefCount) EnableLeakCheck(name string) { if name == "" { panic("invalid name") } - if LeakMode(atomic.LoadUint32(&leakMode)) == NoLeakChecking { + switch LeakMode(atomic.LoadUint32(&leakMode)) { + case NoLeakChecking: return + case LeaksLogTraces: + r.stack = recordStack() } r.name = name runtime.SetFinalizer(r, (*AtomicRefCount).finalize) -- cgit v1.2.3 From 06537129a67cbdced394f514a7d2399c19082f47 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 1 Jul 2019 15:24:18 -0700 Subject: Check remaining traversal limit when creating a file through a symlink. This fixes the case when an app tries to create a file that already exists, and is a symlink to itself. A test was added. PiperOrigin-RevId: 256044811 --- pkg/sentry/syscalls/linux/sys_file.go | 6 ++++++ test/syscalls/linux/symlink.cc | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 3ef7441c2..3410af69c 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -354,6 +354,12 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod break } + // Are we able to resolve further? + if remainingTraversals == 0 { + found.DecRef() + return syscall.ELOOP + } + // Resolve the symlink to a path via Readlink. path, err := found.Inode.Readlink(t) if err != nil { diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index dce8de9ec..69650a1d3 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -312,6 +312,19 @@ TEST_P(ParamSymlinkTest, OpenLinkCreatesTarget) { ASSERT_THAT(unlink(target.c_str()), SyscallSucceeds()); } +// Test that opening a self-symlink with O_CREAT will fail with ELOOP. +TEST_P(ParamSymlinkTest, CreateExistingSelfLink) { + ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); + + const std::string linkpath = GetParam(); + ASSERT_THAT(symlink(linkpath.c_str(), linkpath.c_str()), SyscallSucceeds()); + + EXPECT_THAT(open(linkpath.c_str(), O_CREAT, 0666), + SyscallFailsWithErrno(ELOOP)); + + ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds()); +} + // Test that opening an existing symlink with O_CREAT|O_EXCL will fail with // EEXIST. TEST_P(ParamSymlinkTest, OpenLinkExclFails) { -- cgit v1.2.3 From 4a72c8078ebd3e7b5982f0f7d9fcc4f4d9665ef8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 1 Jul 2019 17:00:08 -0700 Subject: Use new location of python-hello image in tests. PiperOrigin-RevId: 256062988 --- runsc/test/image/image_test.go | 7 +++++-- runsc/test/integration/integration_test.go | 10 ++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index 14cbd30c4..ddaa2c13b 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -209,11 +209,14 @@ func TestMysql(t *testing.T) { } func TestPythonHello(t *testing.T) { - if err := testutil.Pull("google/python-hello"); err != nil { + // TODO(b/136503277): Once we have more complete python runtime tests, + // we can drop this one. + const img = "gcr.io/gvisor-presubmit/python-hello" + if err := testutil.Pull(img); err != nil { t.Fatalf("docker pull failed: %v", err) } d := testutil.MakeDocker("python-hello-test") - if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + if err := d.Run("-p", "8080", img); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 55ebc2f5d..7cef4b9dd 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -86,16 +86,17 @@ func TestLifeCycle(t *testing.T) { } func TestPauseResume(t *testing.T) { + const img = "gcr.io/gvisor-presubmit/python-hello" if !testutil.IsPauseResumeSupported() { t.Log("Pause/resume is not supported, skipping test.") return } - if err := testutil.Pull("google/python-hello"); err != nil { + if err := testutil.Pull(img); err != nil { t.Fatal("docker pull failed:", err) } d := testutil.MakeDocker("pause-resume-test") - if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + if err := d.Run("-p", "8080", img); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() @@ -149,15 +150,16 @@ func TestPauseResume(t *testing.T) { } func TestCheckpointRestore(t *testing.T) { + const img = "gcr.io/gvisor-presubmit/python-hello" if !testutil.IsPauseResumeSupported() { t.Log("Pause/resume is not supported, skipping test.") return } - if err := testutil.Pull("google/python-hello"); err != nil { + if err := testutil.Pull(img); err != nil { t.Fatal("docker pull failed:", err) } d := testutil.MakeDocker("save-restore-test") - if err := d.Run("-p", "8080", "google/python-hello"); err != nil { + if err := d.Run("-p", "8080", img); err != nil { t.Fatalf("docker run failed: %v", err) } defer d.CleanUp() -- cgit v1.2.3 From 0aa9418a778b2fffef4ea4691e97868b498e3b22 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 1 Jul 2019 17:45:19 -0700 Subject: Fix unix/transport.queue reference leaks. Fix two leaks for connectionless Unix sockets: * Double connect: Subsequent connects would leak a reference on the previously connected endpoint. * Close unconnected: Sockets which were not connected at the time of closure would leak a reference on their receiver. PiperOrigin-RevId: 256070451 --- pkg/sentry/socket/unix/transport/connectionless.go | 26 ++++++++++------------ 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index c591f261d..c7f7c5b16 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -54,29 +54,24 @@ func (e *connectionlessEndpoint) isBound() bool { // Close puts the endpoint in a closed state and frees all resources associated // with it. -// -// The socket will be a fresh state after a call to close and may be reused. -// That is, close may be used to "unbind" or "disconnect" the socket in error -// paths. func (e *connectionlessEndpoint) Close() { e.Lock() - var r Receiver - if e.Connected() { - e.receiver.CloseRecv() - r = e.receiver - e.receiver = nil - + if e.connected != nil { e.connected.Release() e.connected = nil } + if e.isBound() { e.path = "" } + + e.receiver.CloseRecv() + r := e.receiver + e.receiver = nil e.Unlock() - if r != nil { - r.CloseNotify() - r.Release() - } + + r.CloseNotify() + r.Release() } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. @@ -139,6 +134,9 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi } e.Lock() + if e.connected != nil { + e.connected.Release() + } e.connected = connected e.Unlock() -- cgit v1.2.3 From 4cd28c6e270324665184d15ba5f2d25edea04af3 Mon Sep 17 00:00:00 2001 From: Ahmet Alp Balkan Date: Tue, 2 Jul 2019 12:05:41 -0700 Subject: sentry/kernel: add syslog message It feels like "reticulating splines" is missing from the list of meaningless syslog messages. Signed-off-by: Ahmet Alp Balkan --- pkg/sentry/kernel/syslog.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 175d1b247..8227ecf1d 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -67,6 +67,7 @@ func (s *syslog) Log() []byte { "Creating process schedule...", "Generating random numbers by fair dice roll...", "Rewriting operating system in Javascript...", + "Reticulating splines...", "Consulting tar man page...", "Forking spaghetti code...", "Checking naughty and nice process list...", -- cgit v1.2.3 From 4f2f44320f9b96bdc8c0fb25c116104ea501ab8b Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 2 Jul 2019 12:53:47 -0700 Subject: Simplify (and fix) refcounts in createAt. fileOpAt holds references on the Dirents passed as arguments to the callback, and drops refs when finished, so we don't need to DecRef those Dirents ourselves However, all Dirents that we get from FindInode/FindLink must be DecRef'd. This CL cleans up the ref-counting logic, and fixes some refcount issues in the process. PiperOrigin-RevId: 256220882 --- pkg/sentry/syscalls/linux/sys_file.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 3410af69c..2776fdec7 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -326,6 +326,7 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod if err != nil { break } + defer found.DecRef() // We found something (possibly a symlink). If the // O_EXCL flag was passed, then we can immediately @@ -346,17 +347,18 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod } // Try to resolve the symlink directly to a Dirent. - resolved, err := found.Inode.Getlink(t) - if err == nil || err != fs.ErrResolveViaReadlink { + var resolved *fs.Dirent + resolved, err = found.Inode.Getlink(t) + if err == nil { // No more resolution necessary. - found.DecRef() - found = resolved + defer resolved.DecRef() break + } else if err != fs.ErrResolveViaReadlink { + return err } // Are we able to resolve further? if remainingTraversals == 0 { - found.DecRef() return syscall.ELOOP } @@ -373,10 +375,10 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod if err != nil { break } + defer newParent.DecRef() // Repeat the process with the parent and name of the // symlink target. - parent.DecRef() parent = newParent name = newName } @@ -384,9 +386,6 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod var newFile *fs.File switch err { case nil: - // The file existed. - defer found.DecRef() - // Like sys_open, check for a few things about the // filesystem before trying to get a reference to the // fs.File. The same constraints on Check apply. -- cgit v1.2.3 From d8ec2fb671e68a2504c10bd254ae64f33752430d Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Tue, 2 Jul 2019 14:03:31 -0700 Subject: Ext4: DiskLayout: Inode interface. PiperOrigin-RevId: 256234390 --- pkg/sentry/fs/ext4/disklayout/BUILD | 8 +- pkg/sentry/fs/ext4/disklayout/block_group.go | 22 +- pkg/sentry/fs/ext4/disklayout/block_group_32.go | 4 +- pkg/sentry/fs/ext4/disklayout/inode.go | 267 ++++++++++++++++++++++++ pkg/sentry/fs/ext4/disklayout/superblock.go | 22 +- 5 files changed, 302 insertions(+), 21 deletions(-) create mode 100644 pkg/sentry/fs/ext4/disklayout/inode.go diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD index cdac63655..e3d2f8d71 100644 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ b/pkg/sentry/fs/ext4/disklayout/BUILD @@ -8,6 +8,7 @@ go_library( "block_group.go", "block_group_32.go", "block_group_64.go", + "inode.go", "superblock.go", "superblock_32.go", "superblock_64.go", @@ -15,7 +16,12 @@ go_library( "test_utils.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout", - deps = ["//pkg/binary"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + ], ) go_test( diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext4/disklayout/block_group.go index 7df76a036..32ea3d97d 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group.go @@ -12,26 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package disklayout provides ext4 disk level structures which can be directly -// filled with bytes from the underlying device. All structures on disk are in -// little-endian order. Only jbd2 (journal) structures are in big-endian order. -// Structs aim to emulate structures `exactly` how they are layed out on disk. -// -// Note: All fields in these structs are exported because binary.Read would -// panic otherwise. package disklayout -// BlockGroup represents Linux struct ext4_group_desc which is internally -// called a block group descriptor. An ext4 file system is split into a series -// of block groups. This provides an access layer to information needed to -// access and use a block group. +// BlockGroup represents a Linux ext block group descriptor. An ext file system +// is split into a series of block groups. This provides an access layer to +// information needed to access and use a block group. // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { // InodeTable returns the absolute block number of the block containing the // inode table. This points to an array of Inode structs. Inode tables are // statically allocated at mkfs time. The superblock records the number of - // inodes per group (length of this table). + // inodes per group (length of this table) and the size of each inode struct. InodeTable() uint64 // BlockBitmap returns the absolute block number of the block containing the @@ -73,15 +65,15 @@ type BlockGroup interface { // Checksum returns this block group's checksum. // - // If RO_COMPAT_METADATA_CSUM feature is set: + // If SbMetadataCsum feature is set: // - checksum is crc32c(FS UUID + group number + group descriptor // structure) & 0xFFFF. // - // If RO_COMPAT_GDT_CSUM feature is set: + // If SbGdtCsum feature is set: // - checksum is crc16(FS UUID + group number + group descriptor // structure). // - // RO_COMPAT_METADATA_CSUM and RO_COMPAT_GDT_CSUM should not be both set. + // SbMetadataCsum and SbGdtCsum should not be both set. // If they are, Linux warns and asks to run fsck. Checksum() uint16 diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext4/disklayout/block_group_32.go index 087f1fb4a..dadecb3e3 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group_32.go @@ -15,8 +15,8 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in -// fs/ext4/ext4.h. It is the block group descriptor struct for 32-bit ext4 -// filesystems. It implements BlockGroup interface. +// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and +// 32-bit ext4 filesystems. It implements BlockGroup interface. // // The suffix `Lo` here stands for lower bits because this is also used in the // 64-bit version where these fields represent the lower half of the fields. diff --git a/pkg/sentry/fs/ext4/disklayout/inode.go b/pkg/sentry/fs/ext4/disklayout/inode.go new file mode 100644 index 000000000..b48001910 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/inode.go @@ -0,0 +1,267 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// The Inode interface must be implemented by structs representing ext inodes. +// The inode stores all the metadata pertaining to the file (except for the +// file name which is held by the directory entry). It does NOT expose all +// fields and should be extended if need be. +// +// Some file systems (e.g. FAT) use the directory entry to store all this +// information. Ext file systems do not so that they can support hard links. +// However, ext4 cheats a little bit and duplicates the file type in the +// directory entry for performance gains. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. +type Inode interface { + // Mode returns the linux file mode which is majorly used to extract + // information like: + // - File permissions (read/write/execute by user/group/others). + // - Sticky, set UID and GID bits. + // - File type. + // + // Masks to extract this information are provided in pkg/abi/linux/file.go. + Mode() linux.FileMode + + // UID returns the owner UID. + UID() auth.KUID + + // GID returns the owner GID. + GID() auth.KGID + + // Size returns the size of the file in bytes. + Size() uint64 + + // InodeSize returns the size of this inode struct in bytes. + // In ext2 and ext3, the inode struct and inode disk record size was fixed at + // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. + // However, accessing any field beyond the 128 bytes marker must be verified + // using this method. + InodeSize() uint16 + + // AccessTime returns the last access time. Shows when the file was last read. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the extended attribute value checksum. + AccessTime() time.Time + + // ChangeTime returns the last change time. Shows when the file meta data + // (like permissions) was last changed. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the lower 32 bits of the attribute + // value’s reference count. + ChangeTime() time.Time + + // ModificationTime returns the last modification time. Shows when the file + // content was last modified. + // + // If InExtendedAttr is set, then this should NOT be used because + // the underlying field contains the number of the inode that owns the + // extended attribute. + ModificationTime() time.Time + + // DeletionTime returns the deletion time. Inodes are marked as deleted by + // writing to the underlying field. FS tools can restore files until they are + // actually overwritten. + DeletionTime() time.Time + + // LinksCount returns the number of hard links to this inode. + // + // Normally there is an upper limit on the number of hard links: + // - ext2/ext3 = 32,000 + // - ext4 = 65,000 + // + // This implies that an ext4 directory cannot have more than 64,998 + // subdirectories because each subdirectory will have a hard link to the + // directory via the `..` entry. The directory has hard link via the `.` entry + // of its own. And finally the inode is initiated with 1 hard link (itself). + // + // The underlying value is reset to 1 if all the following hold: + // - Inode is a directory. + // - SbDirNlink is enabled. + // - Number of hard links is incremented past 64,999. + // Hard link value of 1 for a directory would indicate that the number of hard + // links is unknown because a directory can have minimum 2 hard links (itself + // and `.` entry). + LinksCount() uint16 + + // Flags returns InodeFlags which represents the inode flags. + Flags() InodeFlags + + // Blocks returns the underlying inode.i_block array. This field is special + // and is used to store various kinds of things depending on the filesystem + // version and inode type. + // - In ext2/ext3, it contains the block map. + // - In ext4, it contains the extent tree. + // - For inline files, it contains the file contents. + // - For symlinks, it contains the link path (if it fits here). + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. + Blocks() [60]byte +} + +// Inode flags. This is not comprehensive and flags which were not used in +// the Linux kernel have been excluded. +const ( + // InSync indicates that all writes to the file must be synchronous. + InSync = 0x8 + + // InImmutable indicates that this file is immutable. + InImmutable = 0x10 + + // InAppend indicates that this file can only be appended to. + InAppend = 0x20 + + // InNoDump indicates that teh dump(1) utility should not dump this file. + InNoDump = 0x40 + + // InNoAccessTime indicates that the access time of this inode must not be + // updated. + InNoAccessTime = 0x80 + + // InIndex indicates that this directory has hashed indexes. + InIndex = 0x1000 + + // InJournalData indicates that file data must always be written through a + // journal device. + InJournalData = 0x4000 + + // InDirSync indicates that all the directory entiry data must be written + // synchronously. + InDirSync = 0x10000 + + // InTopDir indicates that this inode is at the top of the directory hierarchy. + InTopDir = 0x20000 + + // InHugeFile indicates that this is a huge file. + InHugeFile = 0x40000 + + // InExtents indicates that this inode uses extents. + InExtents = 0x80000 + + // InExtendedAttr indicates that this inode stores a large extended attribute + // value in its data blocks. + InExtendedAttr = 0x200000 + + // InInline indicates that this inode has inline data. + InInline = 0x10000000 + + // InReserved indicates that this inode is reserved for the ext4 library. + InReserved = 0x80000000 +) + +// InodeFlags represents all possible combinations of inode flags. It aims to +// cover the bit masks and provide a more user-friendly interface. +type InodeFlags struct { + Sync bool + Immutable bool + Append bool + NoDump bool + NoAccessTime bool + Index bool + JournalData bool + DirSync bool + TopDir bool + HugeFile bool + Extents bool + ExtendedAttr bool + Inline bool + Reserved bool +} + +// ToInt converts inode flags back to its 32-bit rep. +func (f InodeFlags) ToInt() uint32 { + var res uint32 + + if f.Sync { + res |= InSync + } + if f.Immutable { + res |= InImmutable + } + if f.Append { + res |= InAppend + } + if f.NoDump { + res |= InNoDump + } + if f.NoAccessTime { + res |= InNoAccessTime + } + if f.Index { + res |= InIndex + } + if f.JournalData { + res |= InJournalData + } + if f.DirSync { + res |= InDirSync + } + if f.TopDir { + res |= InTopDir + } + if f.HugeFile { + res |= InHugeFile + } + if f.Extents { + res |= InExtents + } + if f.ExtendedAttr { + res |= InExtendedAttr + } + if f.Inline { + res |= InInline + } + if f.Reserved { + res |= InReserved + } + + return res +} + +// InodeFlagsFromInt converts the integer representation of inode flags to +// a InodeFlags struct. +func InodeFlagsFromInt(f uint32) InodeFlags { + return InodeFlags{ + Sync: f&InSync > 0, + Immutable: f&InImmutable > 0, + Append: f&InAppend > 0, + NoDump: f&InNoDump > 0, + NoAccessTime: f&InNoAccessTime > 0, + Index: f&InIndex > 0, + JournalData: f&InJournalData > 0, + DirSync: f&InDirSync > 0, + TopDir: f&InTopDir > 0, + HugeFile: f&InHugeFile > 0, + Extents: f&InExtents > 0, + ExtendedAttr: f&InExtendedAttr > 0, + Inline: f&InInline > 0, + Reserved: f&InReserved > 0, + } +} + +// These masks define how users can view/modify inode flags. The rest of the +// flags are for internal kernel usage only. +const ( + InUserReadFlagMask = 0x4BDFFF + InUserWriteFlagMask = 0x4B80FF +) diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext4/disklayout/superblock.go index d630ba8a6..030c73410 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock.go +++ b/pkg/sentry/fs/ext4/disklayout/superblock.go @@ -12,9 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package disklayout provides Linux ext file system's disk level structures +// which can be directly read into from the underlying device. All structures +// on disk are in little-endian order. Only jbd2 (journal) structures are in +// big-endian order. Structs aim to emulate structures `exactly` how they are +// layed out on disk. +// +// This library aims to be compatible with all ext(2/3/4) systems so it +// provides a generic interface for all major structures and various +// implementations (for different versions). The user code is responsible for +// using appropriate implementations based on the underlying device. +// +// Notes: +// - All fields in these structs are exported because binary.Read would +// panic otherwise. +// - All OS dependent fields in these structures will be interpretted using +// the Linux version of that field. package disklayout -// SuperBlock should be implemented by structs representing ext4 superblock. +// SuperBlock should be implemented by structs representing the ext superblock. // The superblock holds a lot of information about the enclosing filesystem. // This interface aims to provide access methods to important information held // by the superblock. It does NOT expose all fields of the superblock, only the @@ -23,11 +39,11 @@ package disklayout // Location and replication: // - The superblock is located at offset 1024 in block group 0. // - Redundant copies of the superblock and group descriptors are kept in -// all groups if sparse_super feature flag is NOT set. If it is set, the +// all groups if SbSparse feature flag is NOT set. If it is set, the // replicas only exist in groups whose group number is either 0 or a // power of 3, 5, or 7. // - There is also a sparse superblock feature v2 in which there are just -// two replicas saved in block groups pointed by the s_backup_bgs field. +// two replicas saved in the block groups pointed by sb.s_backup_bgs. // // Replicas should eventually be updated if the superblock is updated. // -- cgit v1.2.3 From 1178a278ae2501cdd19ffaf6e10c2e3c34b143d0 Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Tue, 2 Jul 2019 17:56:57 -0700 Subject: Mark timers_test flaky because setrlimit(RLIMIT_CPU) is broken in some kernels. https://bugzilla.redhat.com/show_bug.cgi?id=1568337 PiperOrigin-RevId: 256276198 --- test/syscalls/linux/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 8a24d8c0b..4735284ba 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3001,6 +3001,8 @@ cc_binary( testonly = 1, srcs = ["timers.cc"], linkstatic = 1, + # FIXME(b/136599201) + tags = ["flaky"], deps = [ "//test/util:cleanup", "//test/util:logging", -- cgit v1.2.3 From 3f14caeb999f5b93699c46925cbeeee61ec74a86 Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Tue, 2 Jul 2019 18:44:15 -0700 Subject: Add documentation for remaining syscalls (fixes #197, #186) Adds support level documentation for all syscalls. Removes the Undocumented utility function to discourage usage while leaving SupportUndocumented as the default support level for Syscall structs. PiperOrigin-RevId: 256281927 --- pkg/sentry/syscalls/linux/linux64.go | 516 +++++++++++++++++------------------ pkg/sentry/syscalls/syscalls.go | 17 +- 2 files changed, 262 insertions(+), 271 deletions(-) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 94816b038..99af33cfd 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -50,244 +50,244 @@ var AMD64 = &kernel.SyscallTable{ Table: map[uintptr]kernel.Syscall{ 0: syscalls.Supported("read", Read), 1: syscalls.Supported("write", Write), - 2: syscalls.Supported("open", Open), + 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), 3: syscalls.Supported("close", Close), - 4: syscalls.Undocumented("stat", Stat), - 5: syscalls.Undocumented("fstat", Fstat), - 6: syscalls.Undocumented("lstat", Lstat), - 7: syscalls.Undocumented("poll", Poll), - 8: syscalls.Undocumented("lseek", Lseek), - 9: syscalls.Undocumented("mmap", Mmap), - 10: syscalls.Undocumented("mprotect", Mprotect), - 11: syscalls.Undocumented("munmap", Munmap), - 12: syscalls.Undocumented("brk", Brk), - 13: syscalls.Undocumented("rt_sigaction", RtSigaction), - 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask), - 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn), - 16: syscalls.Undocumented("ioctl", Ioctl), - 17: syscalls.Undocumented("pread64", Pread64), - 18: syscalls.Undocumented("pwrite64", Pwrite64), - 19: syscalls.Undocumented("readv", Readv), - 20: syscalls.Undocumented("writev", Writev), - 21: syscalls.Undocumented("access", Access), - 22: syscalls.Undocumented("pipe", Pipe), - 23: syscalls.Undocumented("select", Select), - 24: syscalls.Undocumented("sched_yield", SchedYield), - 25: syscalls.Undocumented("mremap", Mremap), - 26: syscalls.Undocumented("msync", Msync), - 27: syscalls.Undocumented("mincore", Mincore), - 28: syscalls.Undocumented("madvise", Madvise), - 29: syscalls.Undocumented("shmget", Shmget), - 30: syscalls.Undocumented("shmat", Shmat), - 31: syscalls.Undocumented("shmctl", Shmctl), - 32: syscalls.Undocumented("dup", Dup), - 33: syscalls.Undocumented("dup2", Dup2), - 34: syscalls.Undocumented("pause", Pause), - 35: syscalls.Undocumented("nanosleep", Nanosleep), - 36: syscalls.Undocumented("getitimer", Getitimer), - 37: syscalls.Undocumented("alarm", Alarm), - 38: syscalls.Undocumented("setitimer", Setitimer), - 39: syscalls.Undocumented("getpid", Getpid), - 40: syscalls.Undocumented("sendfile", Sendfile), - 41: syscalls.Undocumented("socket", Socket), - 42: syscalls.Undocumented("connect", Connect), - 43: syscalls.Undocumented("accept", Accept), - 44: syscalls.Undocumented("sendto", SendTo), - 45: syscalls.Undocumented("recvfrom", RecvFrom), - 46: syscalls.Undocumented("sendmsg", SendMsg), - 47: syscalls.Undocumented("recvmsg", RecvMsg), - 48: syscalls.Undocumented("shutdown", Shutdown), - 49: syscalls.Undocumented("bind", Bind), - 50: syscalls.Undocumented("listen", Listen), - 51: syscalls.Undocumented("getsockname", GetSockName), - 52: syscalls.Undocumented("getpeername", GetPeerName), - 53: syscalls.Undocumented("socketpair", SocketPair), - 54: syscalls.Undocumented("setsockopt", SetSockOpt), - 55: syscalls.Undocumented("getsockopt", GetSockOpt), - 56: syscalls.Undocumented("clone", Clone), - 57: syscalls.Undocumented("fork", Fork), - 58: syscalls.Undocumented("vfork", Vfork), - 59: syscalls.Undocumented("execve", Execve), - 60: syscalls.Undocumented("exit", Exit), - 61: syscalls.Undocumented("wait4", Wait4), - 62: syscalls.Undocumented("kill", Kill), - 63: syscalls.Undocumented("uname", Uname), - 64: syscalls.Undocumented("semget", Semget), - 65: syscalls.Undocumented("semop", Semop), - 66: syscalls.Undocumented("semctl", Semctl), - 67: syscalls.Undocumented("shmdt", Shmdt), + 4: syscalls.Supported("stat", Stat), + 5: syscalls.Supported("fstat", Fstat), + 6: syscalls.Supported("lstat", Lstat), + 7: syscalls.Supported("poll", Poll), + 8: syscalls.Supported("lseek", Lseek), + 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 10: syscalls.Supported("mprotect", Mprotect), + 11: syscalls.Supported("munmap", Munmap), + 12: syscalls.Supported("brk", Brk), + 13: syscalls.Supported("rt_sigaction", RtSigaction), + 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Supported("rt_sigreturn", RtSigreturn), + 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 17: syscalls.Supported("pread64", Pread64), + 18: syscalls.Supported("pwrite64", Pwrite64), + 19: syscalls.Supported("readv", Readv), + 20: syscalls.Supported("writev", Writev), + 21: syscalls.Supported("access", Access), + 22: syscalls.Supported("pipe", Pipe), + 23: syscalls.Supported("select", Select), + 24: syscalls.Supported("sched_yield", SchedYield), + 25: syscalls.Supported("mremap", Mremap), + 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 32: syscalls.Supported("dup", Dup), + 33: syscalls.Supported("dup2", Dup2), + 34: syscalls.Supported("pause", Pause), + 35: syscalls.Supported("nanosleep", Nanosleep), + 36: syscalls.Supported("getitimer", Getitimer), + 37: syscalls.Supported("alarm", Alarm), + 38: syscalls.Supported("setitimer", Setitimer), + 39: syscalls.Supported("getpid", Getpid), + 40: syscalls.Supported("sendfile", Sendfile), + 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 42: syscalls.Supported("connect", Connect), + 43: syscalls.Supported("accept", Accept), + 44: syscalls.Supported("sendto", SendTo), + 45: syscalls.Supported("recvfrom", RecvFrom), + 46: syscalls.Supported("sendmsg", SendMsg), + 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 50: syscalls.Supported("listen", Listen), + 51: syscalls.Supported("getsockname", GetSockName), + 52: syscalls.Supported("getpeername", GetPeerName), + 53: syscalls.Supported("socketpair", SocketPair), + 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 57: syscalls.Supported("fork", Fork), + 58: syscalls.Supported("vfork", Vfork), + 59: syscalls.Supported("execve", Execve), + 60: syscalls.Supported("exit", Exit), + 61: syscalls.Supported("wait4", Wait4), + 62: syscalls.Supported("kill", Kill), + 63: syscalls.Supported("uname", Uname), + 64: syscalls.Supported("semget", Semget), + 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 67: syscalls.Supported("shmdt", Shmdt), 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 72: syscalls.Undocumented("fcntl", Fcntl), - 73: syscalls.Undocumented("flock", Flock), - 74: syscalls.Undocumented("fsync", Fsync), - 75: syscalls.Undocumented("fdatasync", Fdatasync), - 76: syscalls.Undocumented("truncate", Truncate), - 77: syscalls.Undocumented("ftruncate", Ftruncate), - 78: syscalls.Undocumented("getdents", Getdents), - 79: syscalls.Undocumented("getcwd", Getcwd), - 80: syscalls.Undocumented("chdir", Chdir), - 81: syscalls.Undocumented("fchdir", Fchdir), - 82: syscalls.Undocumented("rename", Rename), - 83: syscalls.Undocumented("mkdir", Mkdir), - 84: syscalls.Undocumented("rmdir", Rmdir), - 85: syscalls.Undocumented("creat", Creat), - 86: syscalls.Undocumented("link", Link), - 87: syscalls.Undocumented("link", Unlink), - 88: syscalls.Undocumented("symlink", Symlink), - 89: syscalls.Undocumented("readlink", Readlink), - 90: syscalls.Undocumented("chmod", Chmod), - 91: syscalls.Undocumented("fchmod", Fchmod), - 92: syscalls.Undocumented("chown", Chown), - 93: syscalls.Undocumented("fchown", Fchown), - 94: syscalls.Undocumented("lchown", Lchown), - 95: syscalls.Undocumented("umask", Umask), - 96: syscalls.Undocumented("gettimeofday", Gettimeofday), - 97: syscalls.Undocumented("getrlimit", Getrlimit), - 98: syscalls.Undocumented("getrusage", Getrusage), - 99: syscalls.Undocumented("sysinfo", Sysinfo), - 100: syscalls.Undocumented("times", Times), - 101: syscalls.Undocumented("ptrace", Ptrace), - 102: syscalls.Undocumented("getuid", Getuid), - 103: syscalls.Undocumented("syslog", Syslog), - 104: syscalls.Undocumented("getgid", Getgid), - 105: syscalls.Undocumented("setuid", Setuid), - 106: syscalls.Undocumented("setgid", Setgid), - 107: syscalls.Undocumented("geteuid", Geteuid), - 108: syscalls.Undocumented("getegid", Getegid), - 109: syscalls.Undocumented("setpgid", Setpgid), - 110: syscalls.Undocumented("getppid", Getppid), - 111: syscalls.Undocumented("getpgrp", Getpgrp), - 112: syscalls.Undocumented("setsid", Setsid), - 113: syscalls.Undocumented("setreuid", Setreuid), - 114: syscalls.Undocumented("setregid", Setregid), - 115: syscalls.Undocumented("getgroups", Getgroups), - 116: syscalls.Undocumented("setgroups", Setgroups), - 117: syscalls.Undocumented("setresuid", Setresuid), - 118: syscalls.Undocumented("getresuid", Getresuid), - 119: syscalls.Undocumented("setresgid", Setresgid), - 120: syscalls.Undocumented("setresgid", Getresgid), - 121: syscalls.Undocumented("getpgid", Getpgid), + 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 76: syscalls.Supported("truncate", Truncate), + 77: syscalls.Supported("ftruncate", Ftruncate), + 78: syscalls.Supported("getdents", Getdents), + 79: syscalls.Supported("getcwd", Getcwd), + 80: syscalls.Supported("chdir", Chdir), + 81: syscalls.Supported("fchdir", Fchdir), + 82: syscalls.Supported("rename", Rename), + 83: syscalls.Supported("mkdir", Mkdir), + 84: syscalls.Supported("rmdir", Rmdir), + 85: syscalls.Supported("creat", Creat), + 86: syscalls.Supported("link", Link), + 87: syscalls.Supported("unlink", Unlink), + 88: syscalls.Supported("symlink", Symlink), + 89: syscalls.Supported("readlink", Readlink), + 90: syscalls.Supported("chmod", Chmod), + 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 92: syscalls.Supported("chown", Chown), + 93: syscalls.Supported("fchown", Fchown), + 94: syscalls.Supported("lchown", Lchown), + 95: syscalls.Supported("umask", Umask), + 96: syscalls.Supported("gettimeofday", Gettimeofday), + 97: syscalls.Supported("getrlimit", Getrlimit), + 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 100: syscalls.Supported("times", Times), + 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 102: syscalls.Supported("getuid", Getuid), + 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 104: syscalls.Supported("getgid", Getgid), + 105: syscalls.Supported("setuid", Setuid), + 106: syscalls.Supported("setgid", Setgid), + 107: syscalls.Supported("geteuid", Geteuid), + 108: syscalls.Supported("getegid", Getegid), + 109: syscalls.Supported("setpgid", Setpgid), + 110: syscalls.Supported("getppid", Getppid), + 111: syscalls.Supported("getpgrp", Getpgrp), + 112: syscalls.Supported("setsid", Setsid), + 113: syscalls.Supported("setreuid", Setreuid), + 114: syscalls.Supported("setregid", Setregid), + 115: syscalls.Supported("getgroups", Getgroups), + 116: syscalls.Supported("setgroups", Setgroups), + 117: syscalls.Supported("setresuid", Setresuid), + 118: syscalls.Supported("getresuid", Getresuid), + 119: syscalls.Supported("setresgid", Setresgid), + 120: syscalls.Supported("setresgid", Getresgid), + 121: syscalls.Supported("getpgid", Getpgid), 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 124: syscalls.Undocumented("getsid", Getsid), - 125: syscalls.Undocumented("capget", Capget), - 126: syscalls.Undocumented("capset", Capset), - 127: syscalls.Undocumented("rt_sigpending", RtSigpending), - 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait), - 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo), - 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend), - 131: syscalls.Undocumented("sigaltstack", Sigaltstack), - 132: syscalls.Undocumented("utime", Utime), - 133: syscalls.Undocumented("mknod", Mknod), + 124: syscalls.Supported("getsid", Getsid), + 125: syscalls.Supported("capget", Capget), + 126: syscalls.Supported("capset", Capset), + 127: syscalls.Supported("rt_sigpending", RtSigpending), + 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Supported("sigaltstack", Sigaltstack), + 132: syscalls.Supported("utime", Utime), + 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil), 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil), 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil), - 137: syscalls.Undocumented("statfs", Statfs), - 138: syscalls.Undocumented("fstatfs", Fstatfs), + 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}), - 140: syscalls.Undocumented("getpriority", Getpriority), - 141: syscalls.Undocumented("setpriority", Setpriority), + 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), - 143: syscalls.Undocumented("sched_getparam", SchedGetparam), - 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler), - 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler), - 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax), - 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin), + 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil), - 149: syscalls.Undocumented("mlock", Mlock), - 150: syscalls.Undocumented("munlock", Munlock), - 151: syscalls.Undocumented("mlockall", Mlockall), - 152: syscalls.Undocumented("munlockall", Munlockall), + 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil), 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil), - 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil), - 157: syscalls.Undocumented("prctl", Prctl), - 158: syscalls.Undocumented("arch_prctl", ArchPrctl), + 156: syscalls.Error("sysctl", syscall.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), - 160: syscalls.Undocumented("setrlimit", Setrlimit), - 161: syscalls.Undocumented("chroot", Chroot), - 162: syscalls.Undocumented("sync", Sync), + 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 161: syscalls.Supported("chroot", Chroot), + 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), - 165: syscalls.Undocumented("mount", Mount), - 166: syscalls.Undocumented("umount2", Umount2), + 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), - 170: syscalls.Undocumented("sethostname", Sethostname), - 171: syscalls.Undocumented("setdomainname", Setdomainname), + 170: syscalls.Supported("sethostname", Sethostname), + 171: syscalls.Supported("setdomainname", Setdomainname), 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil), - 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil), + 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil), 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil), - 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), - 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), - 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil), - 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil), - 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil), - 186: syscalls.Undocumented("gettid", Gettid), + 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux.", nil), + 186: syscalls.Supported("gettid", Gettid), 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) - 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), - 200: syscalls.Undocumented("tkill", Tkill), - 201: syscalls.Undocumented("time", Time), - 202: syscalls.Undocumented("futex", Futex), - 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity), - 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity), + 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 200: syscalls.Supported("tkill", Tkill), + 201: syscalls.Supported("time", Time), + 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 206: syscalls.Undocumented("io_setup", IoSetup), - 207: syscalls.Undocumented("io_destroy", IoDestroy), - 208: syscalls.Undocumented("io_getevents", IoGetevents), - 209: syscalls.Undocumented("io_submit", IoSubmit), - 210: syscalls.Undocumented("io_cancel", IoCancel), + 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), - 213: syscalls.Undocumented("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil), - 217: syscalls.Undocumented("getdents64", Getdents64), - 218: syscalls.Undocumented("set_tid_address", SetTidAddress), - 219: syscalls.Undocumented("restart_syscall", RestartSyscall), + 213: syscalls.Supported("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since Linux 3.16.", nil), + 217: syscalls.Supported("getdents64", Getdents64), + 218: syscalls.Supported("set_tid_address", SetTidAddress), + 219: syscalls.Supported("restart_syscall", RestartSyscall), 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) - 221: syscalls.Undocumented("fadvise64", Fadvise64), - 222: syscalls.Undocumented("timer_create", TimerCreate), - 223: syscalls.Undocumented("timer_settime", TimerSettime), - 224: syscalls.Undocumented("timer_gettime", TimerGettime), - 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun), - 226: syscalls.Undocumented("timer_delete", TimerDelete), - 227: syscalls.Undocumented("clock_settime", ClockSettime), - 228: syscalls.Undocumented("clock_gettime", ClockGettime), - 229: syscalls.Undocumented("clock_getres", ClockGetres), - 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep), - 231: syscalls.Undocumented("exit_group", ExitGroup), - 232: syscalls.Undocumented("epoll_wait", EpollWait), - 233: syscalls.Undocumented("epoll_ctl", EpollCtl), - 234: syscalls.Undocumented("tgkill", Tgkill), - 235: syscalls.Undocumented("utimes", Utimes), + 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 222: syscalls.Supported("timer_create", TimerCreate), + 223: syscalls.Supported("timer_settime", TimerSettime), + 224: syscalls.Supported("timer_gettime", TimerGettime), + 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Supported("timer_delete", TimerDelete), + 227: syscalls.Supported("clock_settime", ClockSettime), + 228: syscalls.Supported("clock_gettime", ClockGettime), + 229: syscalls.Supported("clock_getres", ClockGetres), + 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 231: syscalls.Supported("exit_group", ExitGroup), + 232: syscalls.Supported("epoll_wait", EpollWait), + 233: syscalls.Supported("epoll_ctl", EpollCtl), + 234: syscalls.Supported("tgkill", Tgkill), + 235: syscalls.Supported("utimes", Utimes), 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil), 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), - 238: syscalls.Undocumented("set_mempolicy", SetMempolicy), - 239: syscalls.Undocumented("get_mempolicy", GetMempolicy), + 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) @@ -295,69 +295,69 @@ var AMD64 = &kernel.SyscallTable{ 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), - 247: syscalls.Undocumented("waitid", Waitid), - 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil), - 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil), - 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil), + 247: syscalls.Supported("waitid", Waitid), + 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 253: syscalls.Undocumented("inotify_init", InotifyInit), - 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch), - 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch), + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), - 257: syscalls.Undocumented("openat", Openat), - 258: syscalls.Undocumented("mkdirat", Mkdirat), - 259: syscalls.Undocumented("mknodat", Mknodat), - 260: syscalls.Undocumented("fchownat", Fchownat), - 261: syscalls.Undocumented("futimesat", Futimesat), - 262: syscalls.Undocumented("fstatat", Fstatat), - 263: syscalls.Undocumented("unlinkat", Unlinkat), - 264: syscalls.Undocumented("renameat", Renameat), - 265: syscalls.Undocumented("linkat", Linkat), - 266: syscalls.Undocumented("symlinkat", Symlinkat), - 267: syscalls.Undocumented("readlinkat", Readlinkat), - 268: syscalls.Undocumented("fchmodat", Fchmodat), - 269: syscalls.Undocumented("faccessat", Faccessat), - 270: syscalls.Undocumented("pselect", Pselect), - 271: syscalls.Undocumented("ppoll", Ppoll), - 272: syscalls.Undocumented("unshare", Unshare), - 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil), - 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil), - 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 277: syscalls.Undocumented("sync_file_range", SyncFileRange), + 257: syscalls.Supported("openat", Openat), + 258: syscalls.Supported("mkdirat", Mkdirat), + 259: syscalls.Supported("mknodat", Mknodat), + 260: syscalls.Supported("fchownat", Fchownat), + 261: syscalls.Supported("futimesat", Futimesat), + 262: syscalls.Supported("fstatat", Fstatat), + 263: syscalls.Supported("unlinkat", Unlinkat), + 264: syscalls.Supported("renameat", Renameat), + 265: syscalls.Supported("linkat", Linkat), + 266: syscalls.Supported("symlinkat", Symlinkat), + 267: syscalls.Supported("readlinkat", Readlinkat), + 268: syscalls.Supported("fchmodat", Fchmodat), + 269: syscalls.Supported("faccessat", Faccessat), + 270: syscalls.Supported("pselect", Pselect), + 271: syscalls.Supported("ppoll", Ppoll), + 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete.", nil), + 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) - 280: syscalls.Undocumented("utimensat", Utimensat), - 281: syscalls.Undocumented("epoll_pwait", EpollPwait), + 280: syscalls.Supported("utimensat", Utimensat), + 281: syscalls.Supported("epoll_pwait", EpollPwait), 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) - 283: syscalls.Undocumented("timerfd_create", TimerfdCreate), - 284: syscalls.Undocumented("eventfd", Eventfd), - 285: syscalls.Undocumented("fallocate", Fallocate), - 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime), - 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime), - 288: syscalls.Undocumented("accept4", Accept4), + 283: syscalls.Supported("timerfd_create", TimerfdCreate), + 284: syscalls.Supported("eventfd", Eventfd), + 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 286: syscalls.Supported("timerfd_settime", TimerfdSettime), + 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 288: syscalls.Supported("accept4", Accept4), 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) - 290: syscalls.Undocumented("eventfd2", Eventfd2), - 291: syscalls.Undocumented("epoll_create1", EpollCreate1), - 292: syscalls.Undocumented("dup3", Dup3), - 293: syscalls.Undocumented("pipe2", Pipe2), - 294: syscalls.Undocumented("inotify_init1", InotifyInit1), - 295: syscalls.Undocumented("preadv", Preadv), - 296: syscalls.Undocumented("pwritev", Pwritev), - 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 290: syscalls.Supported("eventfd2", Eventfd2), + 291: syscalls.Supported("epoll_create1", EpollCreate1), + 292: syscalls.Supported("dup3", Dup3), + 293: syscalls.Supported("pipe2", Pipe2), + 294: syscalls.Supported("inotify_init1", InotifyInit1), + 295: syscalls.Supported("preadv", Preadv), + 296: syscalls.Supported("pwritev", Pwritev), + 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil), - 299: syscalls.Undocumented("recvmmsg", RecvMMsg), + 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 302: syscalls.Undocumented("prlimit64", Prlimit64), + 302: syscalls.Supported("prlimit64", Prlimit64), 303: syscalls.Error("name_to_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 304: syscalls.Error("open_by_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), - 306: syscalls.Undocumented("syncfs", Syncfs), - 307: syscalls.Undocumented("sendmmsg", SendMMsg), + 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) - 309: syscalls.Undocumented("getcpu", Getcpu), + 309: syscalls.Supported("getcpu", Getcpu), 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), @@ -365,20 +365,20 @@ var AMD64 = &kernel.SyscallTable{ 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) - 317: syscalls.Undocumented("seccomp", Seccomp), - 318: syscalls.Undocumented("getrandom", GetRandom), - 319: syscalls.Undocumented("memfd_create", MemfdCreate), + 317: syscalls.Supported("seccomp", Seccomp), + 318: syscalls.Supported("getrandom", GetRandom), + 319: syscalls.Supported("memfd_create", MemfdCreate), 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) - 325: syscalls.Undocumented("mlock2", Mlock2), + 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 325 are "backports" from versions of Linux after 4.4. 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), - 327: syscalls.Undocumented("preadv2", Preadv2), - 328: syscalls.Undocumented("pwritev2", Pwritev2), + 327: syscalls.Supported("preadv2", Preadv2), + 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), 332: syscalls.Supported("statx", Statx), }, diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index a5f3d8407..94d52d5d5 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -40,16 +40,7 @@ func Supported(name string, fn kernel.SyscallFn) kernel.Syscall { Name: name, Fn: fn, SupportLevel: kernel.SupportFull, - Note: "Full Support", - } -} - -// Undocumented returns a syscall that is undocumented. -func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall { - return kernel.Syscall{ - Name: name, - Fn: fn, - SupportLevel: kernel.SupportUndocumented, + Note: "Fully Supported.", } } @@ -75,7 +66,7 @@ func Error(name string, err syscall.Errno, note string, urls []string) kernel.Sy return 0, nil, err }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), URLs: urls, } } @@ -93,7 +84,7 @@ func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) return 0, nil, err }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), URLs: urls, } } @@ -115,7 +106,7 @@ func CapError(name string, c linux.Capability, note string, urls []string) kerne return 0, nil, syserror.ENOSYS }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS), + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS), URLs: urls, } } -- cgit v1.2.3 From 753da9604efc74dced3055bb2f5c6bef2d98fe6c Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 2 Jul 2019 19:27:51 -0700 Subject: Remove map from fd_map, change to fd_table. This renames FDMap to FDTable and drops the kernel.FD type, which had an entire package to itself and didn't serve much use (it was freely cast between types, and served as more of an annoyance than providing any protection.) Based on BenchmarkFDLookupAndDecRef-12, we can expect 5-10 ns per lookup operation, and 10-15 ns per concurrent lookup operation of savings. This also fixes two tangential usage issues with the FDMap. Namely, non-atomic use of NewFDFrom and associated calls to Remove (that are both racy and fail to drop the reference on the underlying file.) PiperOrigin-RevId: 256285890 --- pkg/refs/refcounter.go | 7 + pkg/sentry/control/BUILD | 1 - pkg/sentry/control/proc.go | 12 +- pkg/sentry/fs/README.md | 2 +- pkg/sentry/fs/g3doc/inotify.md | 4 +- pkg/sentry/fs/proc/BUILD | 1 - pkg/sentry/fs/proc/fds.go | 38 ++- pkg/sentry/fs/proc/task.go | 4 +- pkg/sentry/kernel/BUILD | 9 +- pkg/sentry/kernel/epoll/BUILD | 1 - pkg/sentry/kernel/epoll/epoll.go | 3 +- pkg/sentry/kernel/fd_map.go | 366 ---------------------------- pkg/sentry/kernel/fd_map_test.go | 136 ----------- pkg/sentry/kernel/fd_table.go | 380 ++++++++++++++++++++++++++++++ pkg/sentry/kernel/fd_table_test.go | 192 +++++++++++++++ pkg/sentry/kernel/fd_table_unsafe.go | 103 ++++++++ pkg/sentry/kernel/kdefs/BUILD | 10 - pkg/sentry/kernel/kdefs/kdefs.go | 20 -- pkg/sentry/kernel/kernel.go | 76 +++--- pkg/sentry/kernel/task.go | 65 ++++- pkg/sentry/kernel/task_clone.go | 40 ++-- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 4 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_start.go | 13 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/socket/BUILD | 1 - pkg/sentry/socket/control/BUILD | 1 - pkg/sentry/socket/control/control.go | 10 +- pkg/sentry/socket/epsocket/BUILD | 1 - pkg/sentry/socket/epsocket/epsocket.go | 8 +- pkg/sentry/socket/hostinet/BUILD | 1 - pkg/sentry/socket/hostinet/socket.go | 9 +- pkg/sentry/socket/netlink/BUILD | 1 - pkg/sentry/socket/netlink/socket.go | 3 +- pkg/sentry/socket/rpcinet/BUILD | 1 - pkg/sentry/socket/rpcinet/socket.go | 8 +- pkg/sentry/socket/socket.go | 3 +- pkg/sentry/socket/unix/BUILD | 1 - pkg/sentry/socket/unix/unix.go | 8 +- pkg/sentry/strace/BUILD | 1 - pkg/sentry/strace/poll.go | 3 +- pkg/sentry/strace/strace.go | 7 +- pkg/sentry/syscalls/BUILD | 1 - pkg/sentry/syscalls/epoll.go | 30 ++- pkg/sentry/syscalls/linux/BUILD | 1 - pkg/sentry/syscalls/linux/sys_aio.go | 9 +- pkg/sentry/syscalls/linux/sys_epoll.go | 7 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 5 +- pkg/sentry/syscalls/linux/sys_file.go | 170 ++++++------- pkg/sentry/syscalls/linux/sys_getdents.go | 9 +- pkg/sentry/syscalls/linux/sys_inotify.go | 15 +- pkg/sentry/syscalls/linux/sys_lseek.go | 5 +- pkg/sentry/syscalls/linux/sys_mmap.go | 5 +- pkg/sentry/syscalls/linux/sys_pipe.go | 23 +- pkg/sentry/syscalls/linux/sys_poll.go | 5 +- pkg/sentry/syscalls/linux/sys_prctl.go | 5 +- pkg/sentry/syscalls/linux/sys_read.go | 21 +- pkg/sentry/syscalls/linux/sys_socket.go | 90 ++++--- pkg/sentry/syscalls/linux/sys_splice.go | 25 +- pkg/sentry/syscalls/linux/sys_stat.go | 17 +- pkg/sentry/syscalls/linux/sys_sync.go | 17 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 13 +- pkg/sentry/syscalls/linux/sys_write.go | 21 +- runsc/boot/BUILD | 1 - runsc/boot/fds.go | 27 +-- runsc/boot/fs.go | 4 +- runsc/boot/loader.go | 22 +- 68 files changed, 1115 insertions(+), 993 deletions(-) delete mode 100644 pkg/sentry/kernel/fd_map.go delete mode 100644 pkg/sentry/kernel/fd_map_test.go create mode 100644 pkg/sentry/kernel/fd_table.go create mode 100644 pkg/sentry/kernel/fd_table_test.go create mode 100644 pkg/sentry/kernel/fd_table_unsafe.go delete mode 100644 pkg/sentry/kernel/kdefs/BUILD delete mode 100644 pkg/sentry/kernel/kdefs/kdefs.go diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 3dd6f0dd4..6ecbace9e 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -365,6 +365,8 @@ func (r *AtomicRefCount) ReadRefs() int64 { // // The sanity check here is limited to real references, since if they have // dropped beneath zero then the object should have been destroyed. +// +//go:nosplit func (r *AtomicRefCount) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { panic("Incrementing non-positive ref count") @@ -379,6 +381,8 @@ func (r *AtomicRefCount) IncRef() { // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to // distinguish other TryIncRef calls from genuine references held. +// +//go:nosplit func (r *AtomicRefCount) TryIncRef() bool { const speculativeRef = 1 << 32 v := atomic.AddInt64(&r.refCount, speculativeRef) @@ -420,6 +424,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) { // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // +//go:nosplit func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: @@ -455,6 +460,8 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { } // DecRef decrements this object's reference count. +// +//go:nosplit func (r *AtomicRefCount) DecRef() { r.DecRefWithDestructor(nil) } diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 5dccb8e3c..bf802d1b6 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/sentry/fs/host", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/state", diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 66a506584..6ae60c5cb 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -123,9 +122,8 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID // TTYFileOperations that wraps the TTY is also returned. func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { // Import file descriptors. - l := limits.NewLimitSet() - fdm := proc.Kernel.NewFDMap() - defer fdm.DecRef() + fdTable := proc.Kernel.NewFDTable() + defer fdTable.DecRef() // No matter what happens, we should close all files in the FilePayload // before returning. Any files that are imported will be duped. @@ -149,9 +147,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI WorkingDirectory: args.WorkingDirectory, Root: args.Root, Credentials: creds, - FDMap: fdm, + FDTable: fdTable, Umask: 0022, - Limits: l, + Limits: limits.NewLimitSet(), MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), @@ -212,7 +210,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } // Add the file to the FD map. - if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { return nil, 0, nil, err } } diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index c4e8faa3c..db4a1b730 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -69,7 +69,7 @@ Specifically this state is: - An `fs.MountManager` containing mount points. -- A `kernel.FDMap` containing pointers to open files. +- A `kernel.FDTable` containing pointers to open files. Anything else managed by the VFS that can be easily loaded into memory from a filesystem is synced back to those filesystems and is not saved. Examples are diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md index e1630553b..71a577d9d 100644 --- a/pkg/sentry/fs/g3doc/inotify.md +++ b/pkg/sentry/fs/g3doc/inotify.md @@ -9,7 +9,7 @@ For the most part, the sentry implementation of inotify mirrors the Linux architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are backed by a pseudo-filesystem. Events are generated from various places in the sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and -the [process fd table][fd_map]. Watches are stored in inodes and generated +the [process fd table][fd_table]. Watches are stored in inodes and generated events are queued to the inotify instance owning the watches for delivery to the user. @@ -114,7 +114,7 @@ ordering. [dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go [event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go -[fd_map]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_map.go +[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go [inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go [inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go [inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index da41a10ab..70ed854a8 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index ea7aded9a..bee421d76 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -42,8 +41,8 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF var file *fs.File var fdFlags kernel.FDFlags t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - file, fdFlags = fdm.GetDescriptor(kdefs.FD(n)) + if fdTable := t.FDTable(); fdTable != nil { + file, fdFlags = fdTable.Get(int32(n)) } }) if file == nil { @@ -56,36 +55,31 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { - var fds kernel.FDs + var fds []int32 t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.GetFDs() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() } }) - fdInts := make([]int, 0, len(fds)) - for _, fd := range fds { - fdInts = append(fdInts, int(fd)) - } - - // Find the fd to start at. - idx := sort.SearchInts(fdInts, int(offset)) - if idx == len(fdInts) { + // Find the appropriate starting point. + idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(offset) }) + if idx == len(fds) { return offset, nil } - fdInts = fdInts[idx:] + fds = fds[idx:] - var fd int - for _, fd = range fdInts { + // Serialize all FDs. + for _, fd := range fds { name := strconv.FormatUint(uint64(fd), 10) - if err := c.DirEmit(name, toDentAttr(fd)); err != nil { + if err := c.DirEmit(name, toDentAttr(int(fd))); err != nil { // Returned offset is the next fd to serialize. return int64(fd), err } } // We serialized them all. Next offset should be higher than last // serialized fd. - return int64(fd + 1), nil + return int64(fds[len(fds)-1] + 1), nil } // fd implements fs.InodeOperations for a file in /proc/TID/fd/. @@ -154,9 +148,9 @@ func (f *fd) Close() error { type fdDir struct { ramfs.Dir - // We hold a reference on the task's fdmap but only keep an indirect - // task pointer to avoid Dirent loading circularity caused by fdmap's - // potential back pointers into the dirent tree. + // We hold a reference on the task's FDTable but only keep an indirect + // task pointer to avoid Dirent loading circularity caused by the + // table's back pointers into the dirent tree. t *kernel.Task } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index b2e36aeee..ef0ca3301 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -580,8 +580,8 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( var fds int var vss, rss, data uint64 s.t.WithMuLocked(func(t *kernel.Task) { - if fdm := t.FDMap(); fdm != nil { - fds = fdm.Size() + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c172d399e..7b92f1b8d 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -96,7 +96,8 @@ go_library( srcs = [ "abstract_socket_namespace.go", "context.go", - "fd_map.go", + "fd_table.go", + "fd_table_unsafe.go", "fs_context.go", "ipc_namespace.go", "kernel.go", @@ -179,7 +180,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/futex", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", @@ -214,7 +214,7 @@ go_test( name = "kernel_test", size = "small", srcs = [ - "fd_map_test.go", + "fd_table_test.go", "table_test.go", "task_test.go", "timekeeper_test.go", @@ -223,9 +223,10 @@ go_test( deps = [ "//pkg/abi", "//pkg/sentry/arch", + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", "//pkg/sentry/fs/filetest", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", "//pkg/sentry/pgalloc", diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index fb99cfc8f..f46c43128 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 33c7dccae..9c0a4e1b4 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -61,7 +60,7 @@ const ( // +stateify savable type FileIdentifier struct { File *fs.File `state:"wait"` - Fd kdefs.FD + Fd int32 } // pollEntry holds all the state associated with an event poll entry, that is, diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go deleted file mode 100644 index 1b84bfe14..000000000 --- a/pkg/sentry/kernel/fd_map.go +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "bytes" - "fmt" - "sort" - "sync" - "sync/atomic" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" -) - -// FDs is an ordering of FD's that can be made stable. -type FDs []kdefs.FD - -func (f FDs) Len() int { - return len(f) -} - -func (f FDs) Swap(i, j int) { - f[i], f[j] = f[j], f[i] -} - -func (f FDs) Less(i, j int) bool { - return f[i] < f[j] -} - -// FDFlags define flags for an individual descriptor. -// -// +stateify savable -type FDFlags struct { - // CloseOnExec indicates the descriptor should be closed on exec. - CloseOnExec bool -} - -// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags -// representation. -func (f FDFlags) ToLinuxFileFlags() (mask uint) { - if f.CloseOnExec { - mask |= linux.O_CLOEXEC - } - return -} - -// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags -// representation. -func (f FDFlags) ToLinuxFDFlags() (mask uint) { - if f.CloseOnExec { - mask |= linux.FD_CLOEXEC - } - return -} - -// descriptor holds the details about a file descriptor, namely a pointer the -// file itself and the descriptor flags. -// -// +stateify savable -type descriptor struct { - file *fs.File - flags FDFlags -} - -// FDMap is used to manage File references and flags. -// -// +stateify savable -type FDMap struct { - refs.AtomicRefCount - k *Kernel - files map[kdefs.FD]descriptor - mu sync.RWMutex `state:"nosave"` - uid uint64 -} - -// ID returns a unique identifier for this FDMap. -func (f *FDMap) ID() uint64 { - return f.uid -} - -// NewFDMap allocates a new FDMap that may be used by tasks in k. -func (k *Kernel) NewFDMap() *FDMap { - f := FDMap{ - k: k, - files: make(map[kdefs.FD]descriptor), - uid: atomic.AddUint64(&k.fdMapUids, 1), - } - f.EnableLeakCheck("kernel.FDMap") - return &f -} - -// destroy removes all of the file descriptors from the map. -func (f *FDMap) destroy() { - f.RemoveIf(func(*fs.File, FDFlags) bool { - return true - }) -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDMap) DecRef() { - f.DecRefWithDestructor(f.destroy) -} - -// Size returns the number of file descriptor slots currently allocated. -func (f *FDMap) Size() int { - f.mu.RLock() - defer f.mu.RUnlock() - - return len(f.files) -} - -// String is a stringer for FDMap. -func (f *FDMap) String() string { - f.mu.RLock() - defer f.mu.RUnlock() - - var b bytes.Buffer - for k, v := range f.files { - n, _ := v.file.Dirent.FullName(nil /* root */) - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n)) - } - return b.String() -} - -// NewFDFrom allocates a new FD guaranteed to be the lowest number available -// greater than or equal to from. This property is important as Unix programs -// tend to count on this allocation order. -func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) { - if fd < 0 { - // Don't accept negative FDs. - return 0, syscall.EINVAL - } - - f.mu.Lock() - defer f.mu.Unlock() - - // Finds the lowest fd not in the handles map. - lim := limitSet.Get(limits.NumberOfFiles) - for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ { - if _, ok := f.files[i]; !ok { - file.IncRef() - f.files[i] = descriptor{file, flags} - return i, nil - } - } - - return -1, syscall.EMFILE -} - -// NewFDAt sets the file reference for the given FD. If there is an -// active reference for that FD, the ref count for that existing reference -// is decremented. -func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error { - if fd < 0 { - // Don't accept negative FDs. - return syscall.EBADF - } - - // In this one case we do not do a defer of the Unlock. The - // reason is that we must have done all the work needed for - // discarding any old open file before we return to the - // caller. In other words, the DecRef(), below, must have - // completed by the time we return to the caller to ensure - // side effects are, in fact, effected. A classic example is - // dup2(fd1, fd2); if fd2 was already open, it must be closed, - // and we don't want to resume the caller until it is; we have - // to block on the DecRef(). Hence we can not just do a 'go - // oldfile.DecRef()', since there would be no guarantee that - // it would be done before we the caller resumed. Since we - // must wait for the DecRef() to finish, and that could take - // time, it's best to first call f.muUnlock beore so we are - // not blocking other uses of this FDMap on the DecRef() call. - f.mu.Lock() - oldDesc, oldExists := f.files[fd] - lim := limitSet.Get(limits.NumberOfFiles).Cur - // if we're closing one then the effective limit is one - // more than the actual limit. - if oldExists && lim != limits.Infinity { - lim++ - } - if lim != limits.Infinity && fd >= kdefs.FD(lim) { - f.mu.Unlock() - return syscall.EMFILE - } - - file.IncRef() - f.files[fd] = descriptor{file, flags} - f.mu.Unlock() - - if oldExists { - oldDesc.file.DecRef() - } - return nil -} - -// SetFlags sets the flags for the given file descriptor, if it is valid. -func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) { - f.mu.Lock() - defer f.mu.Unlock() - - desc, ok := f.files[fd] - if !ok { - return - } - - f.files[fd] = descriptor{desc.file, flags} -} - -// GetDescriptor returns a reference to the file and the flags for the FD. It -// bumps its reference count as well. It returns nil if there is no File -// for the FD, i.e. if the FD is invalid. The caller must use DecRef -// when they are done. -func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) { - f.mu.RLock() - defer f.mu.RUnlock() - - if desc, ok := f.files[fd]; ok { - desc.file.IncRef() - return desc.file, desc.flags - } - return nil, FDFlags{} -} - -// GetFile returns a reference to the File for the FD and bumps -// its reference count as well. It returns nil if there is no File -// for the FD, i.e. if the FD is invalid. The caller must use DecRef -// when they are done. -func (f *FDMap) GetFile(fd kdefs.FD) *fs.File { - f.mu.RLock() - if desc, ok := f.files[fd]; ok { - desc.file.IncRef() - f.mu.RUnlock() - return desc.file - } - f.mu.RUnlock() - return nil -} - -// fds returns an ordering of FDs. -func (f *FDMap) fds() FDs { - fds := make(FDs, 0, len(f.files)) - for fd := range f.files { - fds = append(fds, fd) - } - sort.Sort(fds) - return fds -} - -// GetFDs returns a list of valid fds. -func (f *FDMap) GetFDs() FDs { - f.mu.RLock() - defer f.mu.RUnlock() - return f.fds() -} - -// GetRefs returns a stable slice of references to all files and bumps the -// reference count on each. The caller must use DecRef on each reference when -// they're done using the slice. -func (f *FDMap) GetRefs() []*fs.File { - f.mu.RLock() - defer f.mu.RUnlock() - - fds := f.fds() - fs := make([]*fs.File, 0, len(fds)) - for _, fd := range fds { - desc := f.files[fd] - desc.file.IncRef() - fs = append(fs, desc.file) - } - return fs -} - -// Fork returns an independent FDMap pointing to the same descriptors. -func (f *FDMap) Fork() *FDMap { - f.mu.RLock() - defer f.mu.RUnlock() - - clone := f.k.NewFDMap() - - // Grab a extra reference for every file. - for fd, desc := range f.files { - desc.file.IncRef() - clone.files[fd] = desc - } - - // That's it! - return clone -} - -// unlock releases all file locks held by this FDMap's uid. Must only be -// called on a non-nil *fs.File. -func (f *FDMap) unlock(file *fs.File) { - id := lock.UniqueID(f.ID()) - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF}) -} - -// inotifyFileClose generates the appropriate inotify events for f being closed. -func inotifyFileClose(f *fs.File) { - var ev uint32 - d := f.Dirent - - if fs.IsDir(d.Inode.StableAttr) { - ev |= linux.IN_ISDIR - } - - if f.Flags().Write { - ev |= linux.IN_CLOSE_WRITE - } else { - ev |= linux.IN_CLOSE_NOWRITE - } - - d.InotifyEvent(ev, 0) -} - -// Remove removes an FD from the FDMap, and returns (File, true) if a File -// one was found. Callers are expected to decrement the reference count on -// the File. Otherwise returns (nil, false). -func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) { - f.mu.Lock() - desc := f.files[fd] - delete(f.files, fd) - f.mu.Unlock() - if desc.file != nil { - f.unlock(desc.file) - inotifyFileClose(desc.file) - return desc.file, true - } - return nil, false -} - -// RemoveIf removes all FDs where cond is true. -func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) { - var removed []*fs.File - f.mu.Lock() - for fd, desc := range f.files { - if desc.file != nil && cond(desc.file, desc.flags) { - delete(f.files, fd) - removed = append(removed, desc.file) - } - } - f.mu.Unlock() - - for _, file := range removed { - f.unlock(file) - inotifyFileClose(file) - file.DecRef() - } -} diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go deleted file mode 100644 index 8571dbe59..000000000 --- a/pkg/sentry/kernel/fd_map_test.go +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/sentry/fs/filetest" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" -) - -const ( - // maxFD is the maximum FD to try to create in the map. - // This number of open files has been seen in the wild. - maxFD = 2 * 1024 -) - -func newTestFDMap() *FDMap { - return &FDMap{ - files: make(map[kdefs.FD]descriptor), - } -} - -// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap, -// until there is no room, then makes sure that NewFDAt works -// and also that if we remove one and add one that works too. -func TestFDMapMany(t *testing.T) { - file := filetest.NewTestFile(t) - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) - - f := newTestFDMap() - for i := 0; i < maxFD; i++ { - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) - } - } - - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error") - } - - if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) - } -} - -// TestFDMap does a set of simple tests to make sure simple adds, -// removes, GetRefs, and DecRefs work. The ordering is just weird -// enough that a table-driven approach seemed clumsy. -func TestFDMap(t *testing.T) { - file := filetest.NewTestFile(t) - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true /* privileged */) - - f := newTestFDMap() - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) - } - - if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") - } - - largeLimit := limits.Limit{maxFD, maxFD} - limitSet.Set(limits.NumberOfFiles, largeLimit, true /* privileged */) - - if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) - } else if fd != kdefs.FD(1) { - t.Fatalf("Added an FD to a resized map: got %v, want 1", fd) - } - - if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { - t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) - } - - if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil { - t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) - } - - if ref := f.GetFile(1); ref == nil { - t.Fatalf("f.GetFile(1): got nil, wanted %v", file) - } - - if ref := f.GetFile(2); ref != nil { - t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref) - } - - ref, ok := f.Remove(1) - if !ok { - t.Fatalf("f.Remove(1) for an existing FD: failed, want success") - } - ref.DecRef() - - if ref, ok := f.Remove(1); ok { - ref.DecRef() - t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") - } - -} - -func TestDescriptorFlags(t *testing.T) { - file := filetest.NewTestFile(t) - f := newTestFDMap() - limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) - - origFlags := FDFlags{CloseOnExec: true} - - if err := f.NewFDAt(2, file, origFlags, limitSet); err != nil { - t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) - } - - newFile, newFlags := f.GetDescriptor(2) - if newFile == nil { - t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile) - } - - if newFlags != origFlags { - t.Fatalf("new File flags %+v don't match original %+v", newFlags, origFlags) - } -} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go new file mode 100644 index 000000000..1f3a57dc1 --- /dev/null +++ b/pkg/sentry/kernel/fd_table.go @@ -0,0 +1,380 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "math" + "sync" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/limits" +) + +// FDFlags define flags for an individual descriptor. +// +// +stateify savable +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags +// representation. +func (f FDFlags) ToLinuxFileFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.O_CLOEXEC + } + return +} + +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags +// representation. +func (f FDFlags) ToLinuxFDFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.FD_CLOEXEC + } + return +} + +// descriptor holds the details about a file descriptor, namely a pointer to +// the file itself and the descriptor flags. +// +// Note that this is immutable and can only be changed via operations on the +// descriptorTable. +// +// +stateify savable +type descriptor struct { + file *fs.File + flags FDFlags +} + +// FDTable is used to manage File references and flags. +// +// +stateify savable +type FDTable struct { + refs.AtomicRefCount + k *Kernel + + // uid is a unique identifier. + uid uint64 + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // used contains the number of non-nil entries. + used int32 + + // descriptorTable holds descriptors. + descriptorTable `state:".(map[int32]descriptor)"` +} + +func (f *FDTable) saveDescriptorTable() map[int32]descriptor { + m := make(map[int32]descriptor) + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + m[fd] = descriptor{ + file: file, + flags: flags, + } + }) + return m +} + +func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + f.init() // Initialize table. + for fd, d := range m { + f.set(fd, d.file, d.flags) + + // Note that we do _not_ need to acquire a extra table + // reference here. The table reference will already be + // accounted for in the file, so we drop the reference taken by + // set above. + d.file.DecRef() + } +} + +// drop drops the table reference. +func (f *FDTable) drop(file *fs.File) { + // Release locks. + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF}) + + // Send inotify events. + d := file.Dirent + var ev uint32 + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + if file.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + d.InotifyEvent(ev, 0) + + // Drop the table reference. + file.DecRef() +} + +// ID returns a unique identifier for this FDTable. +func (f *FDTable) ID() uint64 { + return f.uid +} + +// NewFDTable allocates a new FDTable that may be used by tasks in k. +func (k *Kernel) NewFDTable() *FDTable { + f := &FDTable{ + k: k, + uid: atomic.AddUint64(&k.fdMapUids, 1), + } + f.init() + return f +} + +// destroy removes all of the file descriptors from the map. +func (f *FDTable) destroy() { + f.RemoveIf(func(*fs.File, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDTable) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDTable) Size() int { + size := atomic.LoadInt32(&f.used) + return int(size) +} + +// forEach iterates over all non-nil files. +// +// It is the caller's responsibility to acquire an appropriate lock. +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { + fd := int32(0) + for { + file, flags, ok := f.get(fd) + if !ok { + break + } + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + fn(int32(fd), file, flags) + file.DecRef() + } + fd++ + } +} + +// String is a stringer for FDTable. +func (f *FDTable) String() string { + var b bytes.Buffer + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + n, _ := file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + }) + return b.String() +} + +// NewFDs allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.get(i); d == nil { + f.set(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.set(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + return fds, nil +} + +// NewFDAt sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Check the limit for the provided file. + if limitSet := limits.FromContext(ctx); limitSet != nil { + if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { + return syscall.EMFILE + } + } + + // Install the entry. + f.set(fd, file, flags) + return nil +} + +// SetFlags sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.get(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.set(fd, file, flags) + return nil +} + +// Get returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.get(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + +// GetFDs returns a list of valid fds. +func (f *FDTable) GetFDs() []int32 { + fds := make([]int32, 0, f.used) + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + fds = append(fds, fd) + }) + return fds +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefs() []*fs.File { + files := make([]*fs.File, 0, f.Size()) + f.forEach(func(_ int32, file *fs.File, flags FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// Fork returns an independent FDTable. +func (f *FDTable) Fork() *FDTable { + clone := f.k.NewFDTable() + + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + // The set function here will acquire an appropriate table + // reference for the clone. We don't need anything else. + clone.set(fd, file, flags) + }) + return clone +} + +// Remove removes an FD from and returns a non-file iff successful. +// +// N.B. Callers are required to use DecRef when they are done. +func (f *FDTable) Remove(fd int32) *fs.File { + if fd < 0 { + return nil + } + + f.mu.Lock() + defer f.mu.Unlock() + + orig, _, _ := f.get(fd) + if orig != nil { + orig.IncRef() // Reference for caller. + f.set(fd, nil, FDFlags{}) // Zap entry. + } + return orig +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { + f.mu.Lock() + defer f.mu.Unlock() + + f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + if cond(file, flags) { + f.set(fd, nil, FDFlags{}) // Clear from table. + } + }) +} diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go new file mode 100644 index 000000000..2413788e7 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_test.go @@ -0,0 +1,192 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "runtime" + "sync" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/filetest" + "gvisor.dev/gvisor/pkg/sentry/limits" +) + +const ( + // maxFD is the maximum FD to try to create in the map. + // + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func runTest(t testing.TB, fn func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet)) { + t.Helper() // Don't show in stacks. + + // Create the limits and context. + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + + // Create a test file.; + file := filetest.NewTestFile(t) + + // Create the table. + fdTable := new(FDTable) + fdTable.init() + + // Run the test. + fn(ctx, fdTable, file, limitSet) +} + +// TestFDTableMany allocates maxFD FDs, i.e. maxes out the FDTable, until there +// is no room, then makes sure that NewFDAt works and also that if we remove +// one and add one that works too. +func TestFDTableMany(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + for i := 0; i < maxFD; i++ { + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(0, r) in full map: got nil, wanted error") + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + }) +} + +// TestFDTable does a set of simple tests to make sure simple adds, removes, +// GetRefs, and DecRefs work. The ordering is just weird enough that a +// table-driven approach seemed clumsy. +func TestFDTable(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet) { + // Cap the limit at one. + limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true) + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + // Remove the previous limit. + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 1 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("Replacing FD 1 via fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := fdTable.NewFDAt(ctx, maxFD+1, file, FDFlags{}); err == nil { + t.Fatalf("Using an FD that was too large via fdTable.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref, _ := fdTable.Get(1); ref == nil { + t.Fatalf("fdTable.Get(1): got nil, wanted %v", file) + } + + if ref, _ := fdTable.Get(2); ref != nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) + } + + ref := fdTable.Remove(1) + if ref == nil { + t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref := fdTable.Remove(1); ref != nil { + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + }) +} + +func TestDescriptorFlags(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if err := fdTable.NewFDAt(ctx, 2, file, FDFlags{CloseOnExec: true}); err != nil { + t.Fatalf("fdTable.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := fdTable.Get(2) + if newFile == nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %v don't match original %d\n", flags, 0) + } + }) +} + +func BenchmarkFDLookupAndDecRef(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + b.StartTimer() // Benchmark. + for i := 0; i < b.N; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }) +} + +func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + concurrency := runtime.GOMAXPROCS(0) + if concurrency < 4 { + concurrency = 4 + } + each := b.N / concurrency + + b.StartTimer() // Benchmark. + var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < each; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }() + } + wg.Wait() + }) +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go new file mode 100644 index 000000000..e009df974 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -0,0 +1,103 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +type descriptorTable struct { + // slice is a *[]unsafe.Pointer, where each element is actually + // *descriptor object, updated atomically. + // + // Changes to the slice itself requiring holding FDTable.mu. + slice unsafe.Pointer `state:".(map[int32]*descriptor)"` +} + +// init initializes the table. +func (f *FDTable) init() { + var slice []unsafe.Pointer // Empty slice. + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) +} + +// get gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + if fd >= int32(len(slice)) { + return nil, FDFlags{}, false + } + d := (*descriptor)(atomic.LoadPointer(&slice[fd])) + if d == nil { + return nil, FDFlags{}, true + } + return d.file, d.flags, true +} + +// set sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + + // Grow the table as required. + if last := int32(len(slice)); fd >= last { + end := fd + 1 + if end < 2*last { + end = 2 * last + } + slice = append(slice, make([]unsafe.Pointer, end-last)...) + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + } + + // Create the new element. + var d *descriptor + if file != nil { + d = &descriptor{ + file: file, + flags: flags, + } + } + + // Update the single element. + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d))) + + // Acquire a table reference. + if file != nil && (orig == nil || file != orig.file) { + file.IncRef() + } + + // Drop the table reference. + if orig != nil && file != orig.file { + f.drop(orig.file) + } + + // Adjust used. + switch { + case orig == nil && file != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && file == nil: + atomic.AddInt32(&f.used, -1) + } +} diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD deleted file mode 100644 index 5d62f406a..000000000 --- a/pkg/sentry/kernel/kdefs/BUILD +++ /dev/null @@ -1,10 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "kdefs", - srcs = ["kdefs.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs", - visibility = ["//:sandbox"], -) diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go deleted file mode 100644 index 304da2032..000000000 --- a/pkg/sentry/kernel/kdefs/kdefs.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package kdefs defines common kernel definitions. -// -package kdefs - -// FD is a File Descriptor. -type FD int32 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 47dadc43a..38b49cba2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -155,7 +155,7 @@ type Kernel struct { // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` - // fdMapUids is an ever-increasing counter for generating FDMap uids. + // fdMapUids is an ever-increasing counter for generating FDTable uids. // // fdMapUids is mutable, and is accessed using atomic memory operations. fdMapUids uint64 @@ -400,8 +400,8 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(desc descriptor) error { - desc.file.Dirent.Inode.MountSource.FlushDirentRefs() + return k.tasks.forEachFDPaused(func(file *fs.File) error { + file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } @@ -410,35 +410,35 @@ func (k *Kernel) flushMountSourceRefs() error { // task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error { +func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if t.fds == nil { + if t.fdTable == nil { continue } - for _, desc := range t.fds.files { - if err := f(desc); err != nil { - return err + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if lastErr := f(file); lastErr != nil && err == nil { + err = lastErr } - } + }) } - return nil + return err } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - return ts.forEachFDPaused(func(desc descriptor) error { - if flags := desc.file.Flags(); !flags.Write { + return ts.forEachFDPaused(func(file *fs.File) error { + if flags := file.Flags(); !flags.Write { return nil } - if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { return nil } // Here we need all metadata synced. - syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { - name, _ := desc.file.Dirent.FullName(nil /* root */) + name, _ := file.Dirent.FullName(nil /* root */) // Wrap this error in ErrSaveRejection // so that it will trigger a save // error, rather than a panic. This @@ -483,14 +483,12 @@ func (ts *TaskSet) unregisterEpollWaiters() { defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if fdmap := t.fds; fdmap != nil { - for _, desc := range fdmap.files { - if desc.file != nil { - if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() } - } + }) } } } @@ -538,6 +536,8 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { } log.Infof("Memory load took [%s].", time.Since(memoryStart)) + log.Infof("Overall load took [%s]", time.Since(loadStart)) + // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening @@ -602,9 +602,9 @@ type CreateProcessArgs struct { // Credentials is the initial credentials. Credentials *auth.Credentials - // FDMap is the initial set of file descriptors. If CreateProcess succeeds, - // it takes a reference on FDMap. - FDMap *FDMap + // FDTable is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDTable. + FDTable *FDTable // Umask is the initial umask. Umask uint @@ -789,9 +789,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, errors.New(se.String()) } - // Take a reference on the FDMap, which will be transferred to + // Take a reference on the FDTable, which will be transferred to // TaskSet.NewTask(). - args.FDMap.IncRef() + args.FDTable.IncRef() // Create the task. config := &TaskConfig{ @@ -799,7 +799,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, ThreadGroup: tg, TaskContext: tc, FSContext: newFSContext(root, wd, args.Umask), - FDMap: args.FDMap, + FDTable: args.FDTable, Credentials: args.Credentials, AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, @@ -871,7 +871,7 @@ func (k *Kernel) pauseTimeLocked() { } // By precondition, nothing else can be interacting with PIDNamespace.tids - // or FDMap.files, so we can iterate them without synchronization. (We + // or FDTable.files, so we can iterate them without synchronization. (We // can't hold the TaskSet mutex when pausing thread group timers because // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet // mutex, while holding the Timer mutex.) @@ -882,14 +882,14 @@ func (k *Kernel) pauseTimeLocked() { it.PauseTimer() } } - // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } - } + }) } } k.timekeeper.PauseUpdates() @@ -914,12 +914,12 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if fdm := t.fds; fdm != nil { - for _, desc := range fdm.files { - if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } - } + }) } } } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2e3a39d3b..e91f82bb3 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -236,15 +236,15 @@ type Task struct { // tc is protected by mu, and is owned by the task goroutine. tc TaskContext - // fsc is the task's filesystem context. + // fsContext is the task's filesystem context. // - // fsc is protected by mu, and is owned by the task goroutine. - fsc *FSContext + // fsContext is protected by mu, and is owned by the task goroutine. + fsContext *FSContext - // fds is the task's file descriptor table. + // fdTable is the task's file descriptor table. // - // fds is protected by mu, and is owned by the task goroutine. - fds *FDMap + // fdTable is protected by mu, and is owned by the task goroutine. + fdTable *FDTable // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when @@ -602,7 +602,7 @@ func (t *Task) Value(key interface{}) interface{} { case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) case fs.CtxRoot: - return t.fsc.RootDirectory() + return t.fsContext.RootDirectory() case fs.CtxDirentCacheLimiter: return t.k.DirentCacheLimiter case inet.CtxStack: @@ -668,7 +668,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { realRoot := t.tg.mounts.Root() defer realRoot.DecRef() - root := t.fsc.RootDirectory() + root := t.fsContext.RootDirectory() if root != nil { defer root.DecRef() } @@ -689,16 +689,55 @@ func (t *Task) TaskContext() *TaskContext { // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) FSContext() *FSContext { - return t.fsc + return t.fsContext } -// FDMap returns t's FDMap. FDMap does not take an additional reference on the -// returned FDMap. +// FDTable returns t's FDTable. FDMTable does not take an additional reference +// on the returned FDMap. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. -func (t *Task) FDMap() *FDMap { - return t.fds +func (t *Task) FDTable() *FDTable { + return t.fdTable +} + +// GetFile is a convenience wrapper t.FDTable().GetFile. +// +// Precondition: same as FDTable. +func (t *Task) GetFile(fd int32) *fs.File { + f, _ := t.fdTable.Get(fd) + return f +} + +// NewFDs is a convenience wrapper for t.FDTable().NewFDs. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) { + return t.fdTable.NewFDs(t, fd, files, flags) +} + +// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) { + fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags) + if err != nil { + return 0, err + } + return fds[0], nil +} + +// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { + return t.fdTable.NewFDAt(t, fd, file, flags) } // WithMuLocked executes f with t.mu locked. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index b5cc3860d..0916fd658 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -214,20 +214,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } } - var fsc *FSContext + var fsContext *FSContext if opts.NewFSContext { - fsc = t.fsc.Fork() + fsContext = t.fsContext.Fork() } else { - fsc = t.fsc - fsc.IncRef() + fsContext = t.fsContext + fsContext.IncRef() } - var fds *FDMap + var fdTable *FDTable if opts.NewFiles { - fds = t.fds.Fork() + fdTable = t.fdTable.Fork() } else { - fds = t.fds - fds.IncRef() + fdTable = t.fdTable + fdTable.IncRef() } pidns := t.tg.pidns @@ -251,8 +251,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ThreadGroup: tg, SignalMask: t.SignalMask(), TaskContext: tc, - FSContext: fsc, - FDMap: fds, + FSContext: fsContext, + FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), NetworkNamespaced: t.netns, @@ -485,22 +485,22 @@ func (t *Task) Unshare(opts *SharingOptions) error { // namespace" t.ipcns = NewIPCNamespace(creds.UserNamespace) } - var oldfds *FDMap + var oldFDTable *FDTable if opts.NewFiles { - oldfds = t.fds - t.fds = oldfds.Fork() + oldFDTable = t.fdTable + t.fdTable = oldFDTable.Fork() } - var oldfsc *FSContext + var oldFSContext *FSContext if opts.NewFSContext { - oldfsc = t.fsc - t.fsc = oldfsc.Fork() + oldFSContext = t.fsContext + t.fsContext = oldFSContext.Fork() } t.mu.Unlock() - if oldfds != nil { - oldfds.DecRef() + if oldFDTable != nil { + oldFDTable.DecRef() } - if oldfsc != nil { - oldfsc.DecRef() + if oldFSContext != nil { + oldFSContext.DecRef() } return nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index cd85acaef..17a089b90 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -195,7 +195,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. - t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool { + t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b97d65185..535f03e50 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -265,8 +265,8 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsc.DecRef() - t.fds.DecRef() + t.fsContext.DecRef() + t.fdTable.DecRef() // If this is the last task to exit from the thread group, release the // thread group's resources. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index cf48663b6..a29e9b9eb 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -63,7 +63,7 @@ func (t *Task) DebugDumpState() { if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } - t.Debugf("FDMap:\n%s", t.fds) + t.Debugf("FDTable:\n%s", t.fdTable) } // debugDumpRegisters logs register state at log level debug. diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 72caae537..a88bf3951 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -52,9 +52,10 @@ type TaskConfig struct { // succeeds. FSContext *FSContext - // FDMap is the FDMap of the new task. A reference must be held on FDMap, - // which is transferred to TaskSet.NewTask whether or not it succeeds. - FDMap *FDMap + // FDTable is the FDTableof the new task. A reference must be held on + // FDMap, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FDTable *FDTable // Credentials is the Credentials of the new task. Credentials *auth.Credentials @@ -90,7 +91,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { if err != nil { cfg.TaskContext.release() cfg.FSContext.DecRef() - cfg.FDMap.DecRef() + cfg.FDTable.DecRef() return nil, err } return t, nil @@ -112,8 +113,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { signalMask: cfg.SignalMask, signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, tc: *tc, - fsc: cfg.FSContext, - fds: cfg.FDMap, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, p: cfg.Kernel.Platform.NewContext(), k: cfg.Kernel, ptraceTracees: make(map[*Task]struct{}), diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index edfdac2a7..baa12d9a0 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -54,7 +54,7 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // openPath opens name for loading. // // openPath returns the fs.Dirent and an *fs.File for name, which is not -// installed in the Task FDMap. The caller takes ownership of both. +// installed in the Task FDTable. The caller takes ownership of both. // // name must be a readable, executable, regular file. func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) { diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 7a24d4806..2b03ea87c 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -14,7 +14,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 39de46c39..81dbd7309 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -17,7 +17,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index b646dc258..4f4a20dfe 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -63,7 +62,7 @@ type RightsFiles []*fs.File func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { files := make(RightsFiles, 0, len(fds)) for _, fd := range fds { - file, _ := t.FDMap().GetDescriptor(kdefs.FD(fd)) + file := t.GetFile(fd) if file == nil { files.Release() return nil, syserror.EBADF @@ -109,7 +108,9 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 files, trunc := rights.Files(t, max) fds := make([]int32, 0, len(files)) for i := 0; i < max && len(files) > 0; i++ { - fd, err := t.FDMap().NewFDFrom(0, files[0], kernel.FDFlags{cloexec}, t.ThreadGroup().Limits()) + fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) files[0].DecRef() files = files[1:] if err != nil { @@ -315,8 +316,7 @@ func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { // Parse parses a raw socket control message into portable objects. func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) { var ( - fds linux.ControlMessageRights - + fds linux.ControlMessageRights haveCreds bool creds linux.ControlMessageCredentials ) diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 45bb24a3f..1f014f399 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 2a38e370a..b2b2d98a1 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -40,7 +40,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -537,7 +536,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait // Accept implements the linux syscall accept(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. ep, wq, terr := s.Endpoint.Accept() if terr != nil { @@ -575,10 +574,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - fdFlags := kernel.FDFlags{ + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + }) t.Kernel().RecordSocket(ns) diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 4f670beb4..a951f1bb0 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,7 +26,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c63f3aacf..7f69406b7 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -190,7 +189,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { var peerAddr []byte var peerAddrlen uint32 var peerAddrPtr *byte @@ -236,11 +235,11 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } defer f.DecRef() - fdFlags := kernel.FDFlags{ + kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, - } - kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits()) + }) t.Kernel().RecordSocket(f) + return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index f6b001b63..45ebb2a0e 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/netlink/port", diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index ecc1e2d53..f3d6c1e9b 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" @@ -272,7 +271,7 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr } // Accept implements socket.Socket.Accept. -func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Netlink sockets never support accept. return 0, nil, 0, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 96d374383..5061dcbde 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -25,7 +25,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/hostinet", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index cc7b964ea..ccaaddbfc 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" @@ -286,7 +285,7 @@ func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultP } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { payload, se := rpcAccept(t, s.fd, peerRequested) // Check if we need to block. @@ -336,10 +335,9 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, }) defer file.DecRef() - fdFlags := kernel.FDFlags{ + fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, 0, syserr.FromError(err) } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 933120f34..0efa58a58 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -53,7 +52,7 @@ type Socket interface { // Accept implements the accept4(2) linux syscall. // Returns fd, real peer address length and error. Real peer address // length is only set if len(peer) > 0. - Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) + Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) // Bind implements the bind(2) linux syscall. Bind(t *kernel.Task, sockaddr []byte) *syserr.Error diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 8580eb87d..da9977fde 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index bf7d2cfa2..b30871a90 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -194,7 +193,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. ep, err := s.ep.Accept() if err != nil { @@ -229,10 +228,9 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - fdFlags := kernel.FDFlags{ + fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, - } - fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + }) if e != nil { return 0, nil, 0, syserr.FromError(e) } diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index d77c7a433..445d25010 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/netlink", diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go index 57cf6b139..5187594a7 100644 --- a/pkg/sentry/strace/poll.go +++ b/pkg/sentry/strace/poll.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -50,7 +49,7 @@ func pollFD(t *kernel.Task, pfd *linux.PollFD, post bool) string { if post { revents = PollEventSet.Parse(uint64(pfd.REvents)) } - return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, kdefs.FD(pfd.FD)), PollEventSet.Parse(uint64(pfd.Events)), revents) + return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, pfd.FD), PollEventSet.Parse(uint64(pfd.Events)), revents) } func pollFDs(t *kernel.Task, addr usermem.Addr, nfds uint, post bool) string { diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 86e9c5690..311389547 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -31,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -133,7 +132,7 @@ func path(t *kernel.Task, addr usermem.Addr) string { return fmt.Sprintf("%#x %s", addr, path) } -func fd(t *kernel.Task, fd kdefs.FD) string { +func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() if root != nil { defer root.DecRef() @@ -151,7 +150,7 @@ func fd(t *kernel.Task, fd kdefs.FD) string { return fmt.Sprintf("AT_FDCWD %s", name) } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) @@ -375,7 +374,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo } switch i.format[arg] { case FD: - output = append(output, fd(t, kdefs.FD(args[arg].Int()))) + output = append(output, fd(t, args[arg].Int())) case WriteBuffer: output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize)) case WriteIOVec: diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 18fddee76..79d972202 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/kernel", "//pkg/sentry/kernel/epoll", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index c710ec9e3..11850dd0f 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -20,20 +20,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/waiter" ) // CreateEpoll implements the epoll_create(2) linux syscall. -func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { +func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { file := epoll.NewEventPoll(t) defer file.DecRef() - flags := kernel.FDFlags{ + fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: closeOnExec, - } - fd, err := t.FDMap().NewFDFrom(0, file, flags, t.ThreadGroup().Limits()) + }) if err != nil { return 0, err } @@ -42,16 +40,16 @@ func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { } // AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD. -func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { +func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { return syscall.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syscall.EBADF } @@ -68,16 +66,16 @@ func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags } // UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD. -func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { +func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { return syscall.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syscall.EBADF } @@ -94,16 +92,16 @@ func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFl } // RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL. -func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error { +func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(epfd) + epollfile := t.GetFile(epfd) if epollfile == nil { return syscall.EBADF } defer epollfile.DecRef() // Get the target file id. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syscall.EBADF } @@ -120,9 +118,9 @@ func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error { } // WaitEpoll implements the epoll_wait(2) linux syscall. -func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event, error) { +func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) { // Get epoll from the file descriptor. - epollfile := t.FDMap().GetFile(fd) + epollfile := t.GetFile(fd) if epollfile == nil { return nil, syscall.EBADF } diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 6e5be0158..33a40b9c6 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -71,7 +71,6 @@ go_library( "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/eventfd", "//pkg/sentry/kernel/fasync", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/shm", diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 7081d1a45..f56411bfe 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -55,7 +54,7 @@ type ioCallback struct { OpCode uint16 ReqPrio int16 - FD uint32 + FD int32 Buf uint64 Bytes uint64 @@ -65,7 +64,7 @@ type ioCallback struct { Flags uint32 // eventfd to signal if IOCB_FLAG_RESFD is set in flags. - ResFD uint32 + ResFD int32 } // ioEvent describes an I/O result. @@ -292,7 +291,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { - file := t.FDMap().GetFile(kdefs.FD(cb.FD)) + file := t.GetFile(cb.FD) if file == nil { // File not found. return syserror.EBADF @@ -302,7 +301,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Was there an eventFD? Extract it. var eventFile *fs.File if cb.Flags&_IOCB_FLAG_RESFD != 0 { - eventFile = t.FDMap().GetFile(kdefs.FD(cb.ResFD)) + eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. return syserror.EBADF diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 14a61cfa5..a0dd4d4e3 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/syscalls" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -61,9 +60,9 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // EpollCtl implements the epoll_ctl(2) linux syscall. func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := kdefs.FD(args[0].Int()) + epfd := args[0].Int() op := args[1].Int() - fd := kdefs.FD(args[2].Int()) + fd := args[2].Int() eventAddr := args[3].Pointer() // Capture the event state if needed. @@ -132,7 +131,7 @@ func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error { // EpollWait implements the epoll_wait(2) linux syscall. func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := kdefs.FD(args[0].Int()) + epfd := args[0].Int() eventsAddr := args[1].Pointer() maxEvents := int(args[2].Int()) timeout := int(args[3].Int()) diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 7dbe84884..5cfc2c3c8 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -47,10 +47,9 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc }) defer event.DecRef() - fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ CloseOnExec: flags&EFD_CLOEXEC != 0, - }, - t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 2776fdec7..eb6f5648f 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -34,7 +33,7 @@ import ( ) // fileOpAt performs an operation on the second last component in the path. -func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error { +func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error { // Extract the last component. dir, name := fs.SplitLast(path) if dir == "/" { @@ -60,7 +59,7 @@ func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dire } // fileOpOn performs an operation on the last entry of the path. -func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error { +func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error { var ( d *fs.Dirent // The file. wd *fs.Dirent // The working directory (if required.) @@ -78,7 +77,7 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func rel = wd } else { // Need to extract the given FD. - f = t.FDMap().GetFile(dirFD) + f = t.GetFile(dirFD) if f == nil { return syserror.EBADF } @@ -131,7 +130,7 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string return path, dirPath, nil } -func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) { +func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -184,8 +183,9 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u defer file.DecRef() // Success. - fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} - newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return err } @@ -201,7 +201,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u return fd, err // Use result in frame. } -func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -285,7 +285,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Mknodat implements the linux syscall mknodat(2). func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() path := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) // We don't need this argument until we support creation of device nodes. @@ -294,7 +294,7 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, mknodAt(t, dirFD, path, mode) } -func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { +func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -428,8 +428,9 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod } // Success. - fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} - newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return err } @@ -463,7 +464,7 @@ func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Openat implements linux syscall openat(2). func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() flags := uint(args[2].Uint()) if flags&linux.O_CREAT != 0 { @@ -502,7 +503,7 @@ func (ac accessContext) Value(key interface{}) interface{} { } } -func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error { +func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error { const rOK = 4 const wOK = 2 const xOK = 1 @@ -557,7 +558,7 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Faccessat implements linux syscall faccessat(2). func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() flags := args[3].Int() @@ -567,10 +568,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Ioctl implements linux syscall ioctl(2). func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() request := int(args[1].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -579,12 +580,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Shared flags between file and socket. switch request { case linux.FIONCLEX: - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil @@ -728,9 +729,9 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fchdir implements the linux syscall fchdir(2). func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -752,10 +753,13 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Close implements linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file, ok := t.FDMap().Remove(fd) - if !ok { + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + file := t.FDTable().Remove(fd) + if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() @@ -766,30 +770,30 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Dup implements linux syscall dup(2). func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() - newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { return 0, nil, syserror.EMFILE } - return uintptr(newfd), nil, nil + return uintptr(newFD), nil, nil } // Dup2 implements linux syscall dup2(2). func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldfd := kdefs.FD(args[0].Int()) - newfd := kdefs.FD(args[1].Int()) + oldfd := args[0].Int() + newfd := args[1].Int() // If oldfd is a valid file descriptor, and newfd has the same value as oldfd, // then dup2() does nothing, and returns newfd. if oldfd == newfd { - oldFile := t.FDMap().GetFile(oldfd) + oldFile := t.GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } @@ -805,21 +809,21 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Dup3 implements linux syscall dup3(2). func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldfd := kdefs.FD(args[0].Int()) - newfd := kdefs.FD(args[1].Int()) + oldfd := args[0].Int() + newfd := args[1].Int() flags := args[2].Uint() if oldfd == newfd { return 0, nil, syserror.EINVAL } - oldFile := t.FDMap().GetFile(oldfd) + oldFile := t.GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } defer oldFile.DecRef() - err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) + err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}) if err != nil { return 0, nil, err } @@ -862,10 +866,10 @@ func fSetOwn(t *kernel.Task, file *fs.File, who int32) { // Fcntl implements linux syscall fcntl(2). func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() cmd := args[1].Int() - file, flags := t.FDMap().GetDescriptor(fd) + file, flags := t.FDTable().Get(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -873,9 +877,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: - from := kdefs.FD(args[2].Int()) - fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC} - fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) + from := args[2].Int() + fd, err := t.NewFDFrom(from, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) if err != nil { return 0, nil, err } @@ -884,7 +889,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - t.FDMap().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) case linux.F_GETFL: @@ -945,8 +950,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - // The lock uid is that of the Task's FDMap. - lockUniqueID := lock.UniqueID(t.FDMap().ID()) + // The lock uid is that of the Task's FDTable. + lockUniqueID := lock.UniqueID(t.FDTable().ID()) // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. @@ -1036,7 +1041,7 @@ const ( // Fadvise64 implements linux syscall fadvise64(2). // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() length := args[2].Int64() advice := args[3].Int() @@ -1045,7 +1050,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EINVAL } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1071,7 +1076,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } -func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1116,14 +1121,14 @@ func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Mkdirat implements linux syscall mkdirat(2). func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) return 0, nil, mkdirAt(t, dirFD, addr, mode) } -func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { +func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1163,7 +1168,7 @@ func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr) } -func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error { +func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error { newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err @@ -1206,7 +1211,7 @@ func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Symlinkat implements linux syscall symlinkat(2). func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() - dirFD := kdefs.FD(args[1].Int()) + dirFD := args[1].Int() newAddr := args[2].Pointer() return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr) @@ -1248,7 +1253,7 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error { // linkAt creates a hard link to the target specified by oldDirFD and oldAddr, // specified by newDirFD and newAddr. If resolve is true, then the symlinks // will be followed when evaluating the target. -func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error { +func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error { oldPath, _, err := copyInPath(t, oldAddr, allowEmpty) if err != nil { return err @@ -1262,7 +1267,7 @@ func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kd } if allowEmpty && oldPath == "" { - target := t.FDMap().GetFile(oldDirFD) + target := t.GetFile(oldDirFD) if target == nil { return syserror.EBADF } @@ -1324,9 +1329,9 @@ func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Linkat implements linux syscall linkat(2). func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldDirFD := kdefs.FD(args[0].Int()) + oldDirFD := args[0].Int() oldAddr := args[1].Pointer() - newDirFD := kdefs.FD(args[2].Int()) + newDirFD := args[2].Int() newAddr := args[3].Pointer() // man linkat(2): @@ -1351,7 +1356,7 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) } -func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { +func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err @@ -1401,7 +1406,7 @@ func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Readlinkat implements linux syscall readlinkat(2). func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() bufAddr := args[2].Pointer() size := args[3].SizeT() @@ -1410,7 +1415,7 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return n, nil, err } -func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { +func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1440,7 +1445,7 @@ func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Unlinkat implements linux syscall unlinkat(2). func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() flags := args[2].Uint() if flags&linux.AT_REMOVEDIR != 0 { @@ -1501,10 +1506,10 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Ftruncate implements linux syscall ftruncate(2). func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() length := args[1].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1619,7 +1624,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { return nil } -func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { +func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { path, _, err := copyInPath(t, addr, allowEmpty) if err != nil { return err @@ -1627,7 +1632,7 @@ func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty if path == "" { // Annoying. What's wrong with fchown? - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return syserror.EBADF } @@ -1661,11 +1666,11 @@ func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchown implements linux syscall fchown(2). func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() uid := auth.UID(args[1].Uint()) gid := auth.GID(args[2].Uint()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1676,7 +1681,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchownat implements Linux syscall fchownat(2). func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() addr := args[1].Pointer() uid := auth.UID(args[2].Uint()) gid := auth.GID(args[3].Uint()) @@ -1706,7 +1711,7 @@ func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { return nil } -func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { +func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err @@ -1727,10 +1732,10 @@ func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fchmod implements linux syscall fchmod(2). func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() mode := linux.FileMode(args[1].ModeT()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -1741,7 +1746,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fchmodat implements linux syscall fchmodat(2). func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) @@ -1757,7 +1762,7 @@ func defaultSetToSystemTimeSpec() fs.TimeSpec { } } -func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { +func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // Does the task own the file? if !d.Inode.CheckOwnership(t) { @@ -1790,7 +1795,7 @@ func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, r // Linux returns EINVAL in this case. See utimes.c. return syserror.EINVAL } - f := t.FDMap().GetFile(dirFD) + f := t.GetFile(dirFD) if f == nil { return syserror.EBADF } @@ -1858,7 +1863,7 @@ func timespecIsValid(ts linux.Timespec) bool { // Utimensat implements linux syscall utimensat(2). func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() flags := args[3].Int() @@ -1893,7 +1898,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Futimesat implements linux syscall futimesat(2). func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - dirFD := kdefs.FD(args[0].Int()) + dirFD := args[0].Int() pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() @@ -1917,7 +1922,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) } -func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error { +func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error { newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err @@ -1965,21 +1970,21 @@ func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Renameat implements linux syscall renameat(2). func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - oldDirFD := kdefs.FD(args[0].Int()) + oldDirFD := args[0].Int() oldPathAddr := args[1].Pointer() - newDirFD := kdefs.FD(args[2].Int()) + newDirFD := args[2].Int() newPathAddr := args[3].Pointer() return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) } // Fallocate implements linux system call fallocate(2). func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() mode := args[1].Int64() offset := args[2].Int64() length := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -2028,10 +2033,10 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Flock implements linux syscall flock(2). func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() operation := args[1].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF @@ -2138,8 +2143,9 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S defer dirent.DecRef() defer file.DecRef() - fdFlags := kernel.FDFlags{CloseOnExec: cloExec} - newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index dea872672..7a2beda23 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -23,14 +23,13 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) // Getdents implements linux syscall getdents(2) for 64bit systems. func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := int(args[2].Uint()) @@ -46,7 +45,7 @@ func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Getdents64 implements linux syscall getdents64(2). func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := int(args[2].Uint()) @@ -62,8 +61,8 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // getdents implements the core of getdents(2)/getdents64(2). // f is the syscall implementation dirent serialization function. -func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { - dir := t.FDMap().GetFile(fd) +func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { + dir := t.GetFile(fd) if dir == nil { return 0, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 9cfa660fa..a4f36e01f 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ) const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) @@ -44,9 +43,9 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) defer n.DecRef() - fd, err := t.FDMap().NewFDFrom(0, n, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err @@ -63,8 +62,8 @@ func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // fdToInotify resolves an fd to an inotify object. If successful, the file will // have an extra ref and the caller is responsible for releasing the ref. -func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) { - file := t.FDMap().GetFile(fd) +func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { + file := t.GetFile(fd) if file == nil { // Invalid fd. return nil, nil, syscall.EBADF @@ -82,7 +81,7 @@ func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) { // InotifyAddWatch implements the inotify_add_watch() syscall. func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() mask := args[2].Uint() @@ -114,7 +113,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern } // Copy out to the return frame. - fd = kdefs.FD(ino.AddWatch(dirent, mask)) + fd = ino.AddWatch(dirent, mask) return nil }) @@ -123,7 +122,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // InotifyRmWatch implements the inotify_rm_watch() syscall. func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() wd := args[1].Int() ino, file, err := fdToInotify(t, fd) diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index a3813b818..297e920c4 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -18,17 +18,16 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) // Lseek implements linux syscall lseek(2). func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() offset := args[1].Int64() whence := args[2].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index d831833bc..58a05b5bb 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -40,7 +39,7 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() flags := args[3].Int() - fd := kdefs.FD(args[4].Int()) + fd := args[4].Int() fixed := flags&linux.MAP_FIXED != 0 private := flags&linux.MAP_PRIVATE != 0 shared := flags&linux.MAP_SHARED != 0 @@ -80,7 +79,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if !anon { // Convert the passed FD to a file reference. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 7c1bea43d..576022dd5 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -38,24 +38,17 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { w.SetFlags(linuxToFlags(flags).Settable()) defer w.DecRef() - rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{ - CloseOnExec: flags&linux.O_CLOEXEC != 0}, - t.ThreadGroup().Limits()) + fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) if err != nil { return 0, err } - wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{ - CloseOnExec: flags&linux.O_CLOEXEC != 0}, - t.ThreadGroup().Limits()) - if err != nil { - t.FDMap().Remove(rfd) - return 0, err - } - - if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil { - t.FDMap().Remove(rfd) - t.FDMap().Remove(wfd) + if _, err := t.CopyOut(addr, fds); err != nil { + // The files are not closed in this case, the exact semantics + // of this error case are not well defined, but they could have + // already been observed by user space. return 0, syscall.EFAULT } return 0, nil diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index ef6211218..7a13beac2 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -64,7 +63,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan return } - file := t.FDMap().GetFile(kdefs.FD(pfd.FD)) + file := t.GetFile(pfd.FD) if file == nil { pfd.REvents = linux.POLLNVAL return @@ -265,7 +264,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // immediately to ensure we don't leak. Note, another thread // might be about to close fd. This is racy, but that's // OK. Linux is racy in the same way. - file := t.FDMap().GetFile(kdefs.FD(fd)) + file := t.GetFile(fd) if file == nil { return 0, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 9d70881fd..5329023e6 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/mm" ) @@ -122,9 +121,9 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch args[1].Int() { case linux.PR_SET_MM_EXE_FILE: - fd := kdefs.FD(args[2].Int()) + fd := args[2].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index a1965f490..b2474e60d 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -39,11 +38,11 @@ const ( // they can do large reads all at once. Bug for bug. Same for other read // calls below. func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -75,12 +74,12 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Pread64 implements linux syscall pread64(2). func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -122,11 +121,11 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Readv implements linux syscall readv(2). func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -152,12 +151,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Preadv implements linux syscall preadv(2). func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -201,13 +200,13 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 5th argument. - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() flags := int(args[5].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index ccdb079bb..d1165a57a 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -200,9 +199,9 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal }) defer s.DecRef() - fd, err := t.FDMap().NewFDFrom(0, s, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } @@ -225,9 +224,6 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy fileFlags := fs.SettableFileFlags{ NonBlocking: stype&linux.SOCK_NONBLOCK != 0, } - fdFlags := kernel.FDFlags{ - CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, - } // Create the socket pair. s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) @@ -240,20 +236,16 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy defer s2.DecRef() // Create the FDs for the sockets. - fd1, err := t.FDMap().NewFDFrom(0, s1, fdFlags, t.ThreadGroup().Limits()) - if err != nil { - return 0, nil, err - } - fd2, err := t.FDMap().NewFDFrom(0, s2, fdFlags, t.ThreadGroup().Limits()) + fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }) if err != nil { - t.FDMap().Remove(fd1) return 0, nil, err } // Copy the file descriptors out. - if _, err := t.CopyOut(socks, []int32{int32(fd1), int32(fd2)}); err != nil { - t.FDMap().Remove(fd1) - t.FDMap().Remove(fd2) + if _, err := t.CopyOut(socks, fds); err != nil { + // Note that we don't close files here; see pipe(2) also. return 0, nil, err } @@ -262,12 +254,12 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Connect implements the linux syscall connect(2). func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -291,14 +283,14 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // accept is the implementation of the accept syscall. It is called by accept // and accept4 syscall handlers. -func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { +func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, syscall.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, syscall.EBADF } @@ -331,7 +323,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr // Accept4 implements the linux syscall accept4(2). func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() flags := int(args[3].Int()) @@ -342,7 +334,7 @@ func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Accept implements the linux syscall accept(2). func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() @@ -352,12 +344,12 @@ func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Bind implements the linux syscall bind(2). func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -380,11 +372,11 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Listen implements the linux syscall listen(2). func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() backlog := args[1].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -409,11 +401,11 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Shutdown implements the linux syscall shutdown(2). func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() how := args[1].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -437,14 +429,14 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // GetSockOpt implements the linux syscall getsockopt(2). func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLenAddr := args[4].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -521,14 +513,14 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interfac // // Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLen := args[4].Int() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -561,12 +553,12 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // GetSockName implements the linux syscall getsockname(2). func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -589,12 +581,12 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // GetPeerName implements the linux syscall getpeername(2). func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -617,7 +609,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // RecvMsg implements the linux syscall recvmsg(2). func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() @@ -627,7 +619,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -663,7 +655,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // RecvMMsg implements the linux syscall recvmmsg(2). func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() @@ -680,7 +672,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -842,7 +834,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // recvFrom is the implementation of the recvfrom syscall. It is called by // recvfrom and recv syscall handlers. -func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { +func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { if int(bufLen) < 0 { return 0, syscall.EINVAL } @@ -853,7 +845,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, syscall.EBADF } @@ -903,7 +895,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f // RecvFrom implements the linux syscall recvfrom(2). func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() @@ -916,7 +908,7 @@ func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // SendMsg implements the linux syscall sendmsg(2). func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() @@ -926,7 +918,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -953,7 +945,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // SendMMsg implements the linux syscall sendmmsg(2). func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() @@ -964,7 +956,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syscall.EBADF } @@ -1079,14 +1071,14 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // sendTo is the implementation of the sendto syscall. It is called by sendto // and send syscall handlers. -func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { +func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { return 0, syscall.EINVAL } // Get socket from the file descriptor. - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, syscall.EBADF } @@ -1135,7 +1127,7 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla // SendTo implements the linux syscall sendto(2). func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index b6517313f..a7c98efcb 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -81,8 +80,8 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB // Sendfile implements linux system call sendfile(2). func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - outFD := kdefs.FD(args[0].Int()) - inFD := kdefs.FD(args[1].Int()) + outFD := args[0].Int() + inFD := args[1].Int() offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) @@ -92,13 +91,13 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } @@ -163,9 +162,9 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Splice implements splice(2). func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - inFD := kdefs.FD(args[0].Int()) + inFD := args[0].Int() inOffset := args[1].Pointer() - outFD := kdefs.FD(args[2].Int()) + outFD := args[2].Int() outOffset := args[3].Pointer() count := int64(args[4].SizeT()) flags := args[5].Int() @@ -182,13 +181,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } @@ -251,8 +250,8 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Tee imlements tee(2). func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - inFD := kdefs.FD(args[0].Int()) - outFD := kdefs.FD(args[1].Int()) + inFD := args[0].Int() + outFD := args[1].Int() count := int64(args[2].SizeT()) flags := args[3].Int() @@ -265,13 +264,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 // Get files. - outFile := t.FDMap().GetFile(outFD) + outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() - inFile := t.FDMap().GetFile(inFD) + inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 9a5657254..5556bc276 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -42,7 +41,7 @@ func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Fstatat implements linux syscall newfstatat, i.e. fstatat(2). func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() statAddr := args[2].Pointer() flags := args[3].Int() @@ -54,7 +53,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if path == "" { // Annoying. What's wrong with fstat? - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -93,10 +92,10 @@ func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Fstat implements linux syscall fstat(2). func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() statAddr := args[1].Pointer() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -178,7 +177,7 @@ func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs // Statx implements linux syscall statx(2). func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() pathAddr := args[1].Pointer() flags := args[2].Int() mask := args[3].Uint() @@ -197,7 +196,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if path == "" { - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -285,10 +284,10 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fstatfs implements linux syscall fstatfs(2). func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() statfsAddr := args[1].Pointer() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 37225735f..3e55235bd 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,9 +31,9 @@ func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Syncfs implements linux system call syncfs(2). func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -47,9 +46,9 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Fsync implements linux syscall fsync(2). func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -63,9 +62,9 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // At the moment, it just calls Fsync, which is a big hammer, but correct. func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -79,6 +78,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { var err error + fd := args[0].Int() offset := args[1].Int64() nbytes := args[2].Int64() uflags := args[3].Uint() @@ -97,8 +97,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel nbytes = fs.FileMaxOffset } - fd := kdefs.FD(args[0].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index ea6d44315..1ce5ce4c3 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -49,9 +48,9 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel NonBlocking: flags&linux.TFD_NONBLOCK != 0, }) - fd, err := t.FDMap().NewFDFrom(0, f, kernel.FDFlags{ + fd, err := t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, - }, t.ThreadGroup().Limits()) + }) if err != nil { return 0, nil, err } @@ -61,7 +60,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // TimerfdSettime implements Linux syscall timerfd_settime(2). func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() flags := args[1].Int() newValAddr := args[2].Pointer() oldValAddr := args[3].Pointer() @@ -70,7 +69,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne return 0, nil, syserror.EINVAL } - f := t.FDMap().GetFile(fd) + f := t.GetFile(fd) if f == nil { return 0, nil, syserror.EBADF } @@ -101,10 +100,10 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // TimerfdGettime implements Linux syscall timerfd_gettime(2). func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() curValAddr := args[1].Pointer() - f := t.FDMap().GetFile(fd) + f := t.GetFile(fd) if f == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 3a5bf9ac4..5278c96a6 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -39,11 +38,11 @@ const ( // Write implements linux syscall write(2). func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -75,12 +74,12 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Pwrite64 implements linux syscall pwrite64(2). func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -122,11 +121,11 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Writev implements linux syscall writev(2). func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -152,12 +151,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Pwritev implements linux syscall pwritev(2). func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } @@ -202,7 +201,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 5th argument. - fd := kdefs.FD(args[0].Int()) + fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() @@ -212,7 +211,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EACCES } - file := t.FDMap().GetFile(fd) + file := t.GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index d91f66d95..16cd6540f 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -52,7 +52,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel:uncaught_signal_go_proto", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", "//pkg/sentry/loader", "//pkg/sentry/pgalloc", diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 59e1b46ec..e5de1f3d7 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -21,32 +21,23 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.dev/gvisor/pkg/sentry/limits" ) -// createFDMap creates an FD map that contains stdin, stdout, and stderr. If -// console is true, then ioctl calls will be passed through to the host FD. +// createFDTable creates an FD table that contains stdin, stdout, and stderr. +// If console is true, then ioctl calls will be passed through to the host FD. // Upon success, createFDMap dups then closes stdioFDs. -func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { if len(stdioFDs) != 3 { return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } k := kernel.KernelFromContext(ctx) - fdm := k.NewFDMap() - defer fdm.DecRef() + fdTable := k.NewFDTable() + defer fdTable.DecRef() mounter := fs.FileOwnerFromContext(ctx) - // Maps sandbox FD to host FD. - fdMap := map[int]int{ - 0: stdioFDs[0], - 1: stdioFDs[1], - 2: stdioFDs[2], - } - var ttyFile *fs.File - for appFD, hostFD := range fdMap { + for appFD, hostFD := range stdioFDs { var appFile *fs.File if console && appFD < 3 { @@ -80,11 +71,11 @@ func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs } // Add the file to the FD map. - if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { return nil, err } } - fdm.IncRef() - return fdm, nil + fdTable.IncRef() + return fdTable, nil } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 9da0c7067..f9a6f2d3c 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -25,8 +25,10 @@ import ( // Include filesystem types that OCI spec might mount. _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + "gvisor.dev/gvisor/pkg/sentry/fs/gofer" _ "gvisor.dev/gvisor/pkg/sentry/fs/host" _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" @@ -36,8 +38,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/gofer" - "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 89f7d9f94..7e27d1f49 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -517,13 +517,13 @@ func (l *Loader) run() error { // Create the FD map, which will set stdin, stdout, and stderr. If console // is true, then ioctl calls will be passed through to the host fd. ctx := l.rootProcArgs.NewContext(l.k) - fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs) + fdTable, err := createFDTable(ctx, l.console, l.stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on FDMap if successful. We won't need // ours either way. - l.rootProcArgs.FDMap = fdm + l.rootProcArgs.FDTable = fdTable // cid for root container can be empty. Only subcontainers need it to set // the mount location. @@ -562,13 +562,13 @@ func (l *Loader) run() error { return fmt.Errorf("creating init process: %v", err) } - // CreateProcess takes a reference on FDMap if successful. - l.rootProcArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDTable if successful. + l.rootProcArgs.FDTable.DecRef() } ep.tg = l.k.GlobalInit() if l.console { - ttyFile := l.rootProcArgs.FDMap.GetFile(0) + ttyFile, _ := l.rootProcArgs.FDTable.Get(0) defer ttyFile.DecRef() ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) @@ -648,13 +648,13 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file // Create the FD map, which will set stdin, stdout, and stderr. ctx := procArgs.NewContext(l.k) - fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs) + fdTable, err := createFDTable(ctx, false, stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } - // CreateProcess takes a reference on FDMap if successful. We won't need ours - // either way. - procArgs.FDMap = fdm + // CreateProcess takes a reference on fdTable if successful. We won't + // need ours either way. + procArgs.FDTable = fdTable // Can't take ownership away from os.File. dup them to get a new FDs. var goferFDs []int @@ -683,8 +683,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file } l.k.StartProcess(tg) - // CreateProcess takes a reference on FDMap if successful. - procArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDTable if successful. + procArgs.FDTable.DecRef() l.processes[eid].tg = tg return nil -- cgit v1.2.3 From 85b27a9f8f0c04f10f16a93e848efb926a391638 Mon Sep 17 00:00:00 2001 From: Yong He Date: Tue, 2 Jul 2019 22:02:15 -0700 Subject: Solve BounceToKernel may hang issue BounceToKernel will make vCPU quit from guest ring3 to guest ring0, but vCPUWaiter is not cleared when we unlock the vCPU, when next time this vCPU enter guest mode ring3, vCPU may enter guest mode with vCPUWaiter bit setted, this will cause the following BounceToKernel to this vCPU hangs at waitUntilNot. Halt may workaroud this issue, because halt process will reset vCPU status into vCPUUser, and notify all waiter for vCPU state change, but if there is no exception or syscall in this period, BounceToKernel will hang at waitUntilNot. PiperOrigin-RevId: 256299660 --- pkg/sentry/platform/kvm/machine.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 7d92e16cc..679087e25 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -426,7 +426,12 @@ func (c *vCPU) unlock() { // Normal state. case vCPUUser | vCPUGuest | vCPUWaiter: // Force a transition: this must trigger a notification when we - // return from guest mode. + // return from guest mode. We must clear vCPUWaiter here + // anyways, because BounceToKernel will force a transition only + // from ring3 to ring0, which will not clear this bit. Halt may + // workaround the issue, but if there is no exception or + // syscall in this period, BounceToKernel will hang. + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) c.notify() case vCPUUser | vCPUWaiter: // Waiting for the lock to be released; the responsibility is -- cgit v1.2.3 From 116cac053e2e4e167caa9707439065af7c7b82b3 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 3 Jul 2019 13:57:24 -0700 Subject: netstack/udp: connect with the AF_UNSPEC address family means disconnect PiperOrigin-RevId: 256433283 --- pkg/sentry/socket/epsocket/epsocket.go | 17 ++-- pkg/sentry/socket/unix/unix.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/syserr/netstack.go | 75 ++++++++------- pkg/tcpip/tcpip.go | 79 ++++++++------- pkg/tcpip/transport/icmp/endpoint.go | 5 + pkg/tcpip/transport/raw/endpoint.go | 5 + pkg/tcpip/transport/tcp/endpoint.go | 5 + pkg/tcpip/transport/tcp/endpoint_state.go | 1 + pkg/tcpip/transport/udp/endpoint.go | 42 +++++++- pkg/tcpip/transport/udp/endpoint_state.go | 2 - test/syscalls/linux/udp_socket.cc | 155 +++++++++++++++++++++++++++++- 12 files changed, 299 insertions(+), 91 deletions(-) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index b2b2d98a1..9d1bcfd41 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -285,14 +285,14 @@ func bytesToIPAddress(addr []byte) tcpip.Address { // GetAddress reads an sockaddr struct from the given address and converts it // to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6 // addresses. -func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { +func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { return tcpip.FullAddress{}, syserr.ErrInvalidArgument } family := usermem.ByteOrder.Uint16(addr) - if family != uint16(sfamily) { + if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) { return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported } @@ -317,7 +317,7 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { case linux.AF_INET: var a linux.SockAddrInet if len(addr) < sockAddrInetSize { - return tcpip.FullAddress{}, syserr.ErrBadAddress + return tcpip.FullAddress{}, syserr.ErrInvalidArgument } binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) @@ -330,7 +330,7 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { case linux.AF_INET6: var a linux.SockAddrInet6 if len(addr) < sockAddrInet6Size { - return tcpip.FullAddress{}, syserr.ErrBadAddress + return tcpip.FullAddress{}, syserr.ErrInvalidArgument } binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) @@ -343,6 +343,9 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { } return out, nil + case linux.AF_UNSPEC: + return tcpip.FullAddress{}, nil + default: return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported } @@ -465,7 +468,7 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { - addr, err := GetAddress(s.family, sockaddr) + addr, err := GetAddress(s.family, sockaddr, false /* strict */) if err != nil { return err } @@ -498,7 +501,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { - addr, err := GetAddress(s.family, sockaddr) + addr, err := GetAddress(s.family, sockaddr, true /* strict */) if err != nil { return err } @@ -1922,7 +1925,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] var addr *tcpip.FullAddress if len(to) > 0 { - addrBuf, err := GetAddress(s.family, to) + addrBuf, err := GetAddress(s.family, to, true /* strict */) if err != nil { return 0, err } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index b30871a90..637168714 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -110,7 +110,7 @@ func (s *SocketOperations) Endpoint() transport.Endpoint { // extractPath extracts and validates the address. func extractPath(sockaddr []byte) (string, *syserr.Error) { - addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr) + addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr, true /* strict */) if err != nil { return "", err } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index f9cf2eb21..386b40af7 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -332,7 +332,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string { switch family { case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: - fa, err := epsocket.GetAddress(int(family), b) + fa, err := epsocket.GetAddress(int(family), b, true /* strict */) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 965808a27..8ff922c69 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -49,43 +49,44 @@ var ( ) var netstackErrorTranslations = map[*tcpip.Error]*Error{ - tcpip.ErrUnknownProtocol: ErrUnknownProtocol, - tcpip.ErrUnknownNICID: ErrUnknownNICID, - tcpip.ErrUnknownDevice: ErrUnknownDevice, - tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption, - tcpip.ErrDuplicateNICID: ErrDuplicateNICID, - tcpip.ErrDuplicateAddress: ErrDuplicateAddress, - tcpip.ErrNoRoute: ErrNoRoute, - tcpip.ErrBadLinkEndpoint: ErrBadLinkEndpoint, - tcpip.ErrAlreadyBound: ErrAlreadyBound, - tcpip.ErrInvalidEndpointState: ErrInvalidEndpointState, - tcpip.ErrAlreadyConnecting: ErrAlreadyConnecting, - tcpip.ErrAlreadyConnected: ErrAlreadyConnected, - tcpip.ErrNoPortAvailable: ErrNoPortAvailable, - tcpip.ErrPortInUse: ErrPortInUse, - tcpip.ErrBadLocalAddress: ErrBadLocalAddress, - tcpip.ErrClosedForSend: ErrClosedForSend, - tcpip.ErrClosedForReceive: ErrClosedForReceive, - tcpip.ErrWouldBlock: ErrWouldBlock, - tcpip.ErrConnectionRefused: ErrConnectionRefused, - tcpip.ErrTimeout: ErrTimeout, - tcpip.ErrAborted: ErrAborted, - tcpip.ErrConnectStarted: ErrConnectStarted, - tcpip.ErrDestinationRequired: ErrDestinationRequired, - tcpip.ErrNotSupported: ErrNotSupported, - tcpip.ErrQueueSizeNotSupported: ErrQueueSizeNotSupported, - tcpip.ErrNotConnected: ErrNotConnected, - tcpip.ErrConnectionReset: ErrConnectionReset, - tcpip.ErrConnectionAborted: ErrConnectionAborted, - tcpip.ErrNoSuchFile: ErrNoSuchFile, - tcpip.ErrInvalidOptionValue: ErrInvalidOptionValue, - tcpip.ErrNoLinkAddress: ErrHostDown, - tcpip.ErrBadAddress: ErrBadAddress, - tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, - tcpip.ErrMessageTooLong: ErrMessageTooLong, - tcpip.ErrNoBufferSpace: ErrNoBufferSpace, - tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, - tcpip.ErrNotPermitted: ErrNotPermittedNet, + tcpip.ErrUnknownProtocol: ErrUnknownProtocol, + tcpip.ErrUnknownNICID: ErrUnknownNICID, + tcpip.ErrUnknownDevice: ErrUnknownDevice, + tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption, + tcpip.ErrDuplicateNICID: ErrDuplicateNICID, + tcpip.ErrDuplicateAddress: ErrDuplicateAddress, + tcpip.ErrNoRoute: ErrNoRoute, + tcpip.ErrBadLinkEndpoint: ErrBadLinkEndpoint, + tcpip.ErrAlreadyBound: ErrAlreadyBound, + tcpip.ErrInvalidEndpointState: ErrInvalidEndpointState, + tcpip.ErrAlreadyConnecting: ErrAlreadyConnecting, + tcpip.ErrAlreadyConnected: ErrAlreadyConnected, + tcpip.ErrNoPortAvailable: ErrNoPortAvailable, + tcpip.ErrPortInUse: ErrPortInUse, + tcpip.ErrBadLocalAddress: ErrBadLocalAddress, + tcpip.ErrClosedForSend: ErrClosedForSend, + tcpip.ErrClosedForReceive: ErrClosedForReceive, + tcpip.ErrWouldBlock: ErrWouldBlock, + tcpip.ErrConnectionRefused: ErrConnectionRefused, + tcpip.ErrTimeout: ErrTimeout, + tcpip.ErrAborted: ErrAborted, + tcpip.ErrConnectStarted: ErrConnectStarted, + tcpip.ErrDestinationRequired: ErrDestinationRequired, + tcpip.ErrNotSupported: ErrNotSupported, + tcpip.ErrQueueSizeNotSupported: ErrQueueSizeNotSupported, + tcpip.ErrNotConnected: ErrNotConnected, + tcpip.ErrConnectionReset: ErrConnectionReset, + tcpip.ErrConnectionAborted: ErrConnectionAborted, + tcpip.ErrNoSuchFile: ErrNoSuchFile, + tcpip.ErrInvalidOptionValue: ErrInvalidOptionValue, + tcpip.ErrNoLinkAddress: ErrHostDown, + tcpip.ErrBadAddress: ErrBadAddress, + tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, + tcpip.ErrMessageTooLong: ErrMessageTooLong, + tcpip.ErrNoBufferSpace: ErrNoBufferSpace, + tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, + tcpip.ErrNotPermitted: ErrNotPermittedNet, + tcpip.ErrAddressFamilyNotSupported: ErrAddressFamilyNotSupported, } // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 966b3acfd..c61f96fb0 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -66,43 +66,44 @@ func (e *Error) IgnoreStats() bool { // Errors that can be returned by the network stack. var ( - ErrUnknownProtocol = &Error{msg: "unknown protocol"} - ErrUnknownNICID = &Error{msg: "unknown nic id"} - ErrUnknownDevice = &Error{msg: "unknown device"} - ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} - ErrDuplicateNICID = &Error{msg: "duplicate nic id"} - ErrDuplicateAddress = &Error{msg: "duplicate address"} - ErrNoRoute = &Error{msg: "no route"} - ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"} - ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true} - ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"} - ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true} - ErrAlreadyConnected = &Error{msg: "endpoint is already connected", ignoreStats: true} - ErrNoPortAvailable = &Error{msg: "no ports are available"} - ErrPortInUse = &Error{msg: "port is in use"} - ErrBadLocalAddress = &Error{msg: "bad local address"} - ErrClosedForSend = &Error{msg: "endpoint is closed for send"} - ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"} - ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true} - ErrConnectionRefused = &Error{msg: "connection was refused"} - ErrTimeout = &Error{msg: "operation timed out"} - ErrAborted = &Error{msg: "operation aborted"} - ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true} - ErrDestinationRequired = &Error{msg: "destination address is required"} - ErrNotSupported = &Error{msg: "operation not supported"} - ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"} - ErrNotConnected = &Error{msg: "endpoint not connected"} - ErrConnectionReset = &Error{msg: "connection reset by peer"} - ErrConnectionAborted = &Error{msg: "connection aborted"} - ErrNoSuchFile = &Error{msg: "no such file"} - ErrInvalidOptionValue = &Error{msg: "invalid option value specified"} - ErrNoLinkAddress = &Error{msg: "no remote link address"} - ErrBadAddress = &Error{msg: "bad address"} - ErrNetworkUnreachable = &Error{msg: "network is unreachable"} - ErrMessageTooLong = &Error{msg: "message too long"} - ErrNoBufferSpace = &Error{msg: "no buffer space available"} - ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} - ErrNotPermitted = &Error{msg: "operation not permitted"} + ErrUnknownProtocol = &Error{msg: "unknown protocol"} + ErrUnknownNICID = &Error{msg: "unknown nic id"} + ErrUnknownDevice = &Error{msg: "unknown device"} + ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} + ErrDuplicateNICID = &Error{msg: "duplicate nic id"} + ErrDuplicateAddress = &Error{msg: "duplicate address"} + ErrNoRoute = &Error{msg: "no route"} + ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"} + ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true} + ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"} + ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true} + ErrAlreadyConnected = &Error{msg: "endpoint is already connected", ignoreStats: true} + ErrNoPortAvailable = &Error{msg: "no ports are available"} + ErrPortInUse = &Error{msg: "port is in use"} + ErrBadLocalAddress = &Error{msg: "bad local address"} + ErrClosedForSend = &Error{msg: "endpoint is closed for send"} + ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"} + ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true} + ErrConnectionRefused = &Error{msg: "connection was refused"} + ErrTimeout = &Error{msg: "operation timed out"} + ErrAborted = &Error{msg: "operation aborted"} + ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true} + ErrDestinationRequired = &Error{msg: "destination address is required"} + ErrNotSupported = &Error{msg: "operation not supported"} + ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"} + ErrNotConnected = &Error{msg: "endpoint not connected"} + ErrConnectionReset = &Error{msg: "connection reset by peer"} + ErrConnectionAborted = &Error{msg: "connection aborted"} + ErrNoSuchFile = &Error{msg: "no such file"} + ErrInvalidOptionValue = &Error{msg: "invalid option value specified"} + ErrNoLinkAddress = &Error{msg: "no remote link address"} + ErrBadAddress = &Error{msg: "bad address"} + ErrNetworkUnreachable = &Error{msg: "network is unreachable"} + ErrMessageTooLong = &Error{msg: "message too long"} + ErrNoBufferSpace = &Error{msg: "no buffer space available"} + ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} + ErrNotPermitted = &Error{msg: "operation not permitted"} + ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"} ) // Errors related to Subnet @@ -339,6 +340,10 @@ type Endpoint interface { // get the actual result. The first call to Connect after the socket has // connected returns nil. Calling connect again results in ErrAlreadyConnected. // Anything else -- the attempt to connect failed. + // + // If address.Addr is empty, this means that Enpoint has to be + // disconnected if this is supported, otherwise + // ErrAddressFamilyNotSupported must be returned. Connect(address FullAddress) *Error // Shutdown closes the read and/or write end of the endpoint connection diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 33cefd937..ab9e80747 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -422,6 +422,11 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() + if addr.Addr == "" { + // AF_UNSPEC isn't supported. + return tcpip.ErrAddressFamilyNotSupported + } + nicid := addr.NIC localPort := uint16(0) switch e.state { diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 73599dd9d..42aded77f 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -298,6 +298,11 @@ func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() + if addr.Addr == "" { + // AF_UNSPEC isn't supported. + return tcpip.ErrAddressFamilyNotSupported + } + if ep.closed { return tcpip.ErrInvalidEndpointState } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index ee60ebf58..cb40fea94 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1271,6 +1271,11 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocol // Connect connects the endpoint to its peer. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + if addr.Addr == "" && addr.Port == 0 { + // AF_UNSPEC isn't supported. + return tcpip.ErrAddressFamilyNotSupported + } + return e.connect(addr, true, true) } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index ec61a3886..b93959034 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -342,6 +342,7 @@ func loadError(s string) *tcpip.Error { tcpip.ErrNoBufferSpace, tcpip.ErrBroadcastDisabled, tcpip.ErrNotPermitted, + tcpip.ErrAddressFamilyNotSupported, } messageToError = make(map[string]*tcpip.Error) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 99fdfb795..cb0ea83a6 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -698,8 +698,44 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (t return netProto, nil } +func (e *endpoint) disconnect() *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + if e.state != stateConnected { + return nil + } + id := stack.TransportEndpointID{} + // Exclude ephemerally bound endpoints. + if e.bindNICID != 0 || e.id.LocalAddress == "" { + var err *tcpip.Error + id = stack.TransportEndpointID{ + LocalPort: e.id.LocalPort, + LocalAddress: e.id.LocalAddress, + } + id, err = e.registerWithStack(e.regNICID, e.effectiveNetProtos, id) + if err != nil { + return err + } + e.state = stateBound + } else { + e.state = stateInitial + } + + e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e) + e.id = id + e.route.Release() + e.route = stack.Route{} + e.dstPort = 0 + + return nil +} + // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + if addr.Addr == "" { + return e.disconnect() + } if addr.Port == 0 { // We don't support connecting to port zero. return tcpip.ErrInvalidEndpointState @@ -734,12 +770,16 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { defer r.Release() id := stack.TransportEndpointID{ - LocalAddress: r.LocalAddress, + LocalAddress: e.id.LocalAddress, LocalPort: localPort, RemotePort: addr.Port, RemoteAddress: r.RemoteAddress, } + if e.state == stateInitial { + id.LocalAddress = r.LocalAddress + } + // Even if we're connected, this endpoint can still be used to send // packets on a different network protocol, so we register both even if // v6only is set to false and this is an ipv6 endpoint. diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 701bdd72b..18e786397 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -92,8 +92,6 @@ func (e *endpoint) afterLoad() { if err != nil { panic(*err) } - - e.id.LocalAddress = e.route.LocalAddress } else if len(e.id.LocalAddress) != 0 { // stateBound if e.stack.CheckLocalAddress(e.regNICID, netProto, e.id.LocalAddress) == 0 { panic(tcpip.ErrBadLocalAddress) diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index 31db8a2ad..1bb0307c4 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -304,12 +305,50 @@ TEST_P(UdpSocketTest, ReceiveAfterConnect) { SyscallSucceedsWithValue(sizeof(buf))); // Receive the data. - char received[512]; + char received[sizeof(buf)]; EXPECT_THAT(recv(s_, received, sizeof(received), 0), SyscallSucceedsWithValue(sizeof(received))); EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); } +TEST_P(UdpSocketTest, ReceiveAfterDisconnect) { + // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds()); + + // Get the address s_ was bound to during connect. + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + + for (int i = 0; i < 2; i++) { + // Send from t_ to s_. + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, + reinterpret_cast(&addr), addrlen), + SyscallSucceedsWithValue(sizeof(buf))); + + // Receive the data. + char received[sizeof(buf)]; + EXPECT_THAT(recv(s_, received, sizeof(received), 0), + SyscallSucceedsWithValue(sizeof(received))); + EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); + + // Disconnect s_. + struct sockaddr addr = {}; + addr.sa_family = AF_UNSPEC; + ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds()); + // Connect s_ loopback:TestPort. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + } +} + TEST_P(UdpSocketTest, Connect) { ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); @@ -335,6 +374,112 @@ TEST_P(UdpSocketTest, Connect) { EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); } +TEST_P(UdpSocketTest, DisconnectAfterBind) { + ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds()); + // Connect the socket. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + struct sockaddr_storage addr = {}; + addr.ss_family = AF_UNSPEC; + EXPECT_THAT( + connect(s_, reinterpret_cast(&addr), sizeof(addr.ss_family)), + SyscallSucceeds()); + + // Check that we're still bound. + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0); + + addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(UdpSocketTest, DisconnectAfterBindToAny) { + struct sockaddr_storage baddr = {}; + socklen_t addrlen; + auto port = *Port(reinterpret_cast(addr_[1])); + if (addr_[0]->sa_family == AF_INET) { + auto addr_in = reinterpret_cast(&baddr); + addr_in->sin_family = AF_INET; + addr_in->sin_port = port; + inet_pton(AF_INET, "0.0.0.0", + reinterpret_cast(&addr_in->sin_addr.s_addr)); + } else { + auto addr_in = reinterpret_cast(&baddr); + addr_in->sin6_family = AF_INET6; + addr_in->sin6_port = port; + inet_pton(AF_INET6, + "::", reinterpret_cast(&addr_in->sin6_addr.s6_addr)); + addr_in->sin6_scope_id = 0; + } + ASSERT_THAT(bind(s_, reinterpret_cast(&baddr), addrlen_), + SyscallSucceeds()); + // Connect the socket. + ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); + + struct sockaddr_storage addr = {}; + addr.ss_family = AF_UNSPEC; + EXPECT_THAT( + connect(s_, reinterpret_cast(&addr), sizeof(addr.ss_family)), + SyscallSucceeds()); + + // Check that we're still bound. + addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0); + + addrlen = sizeof(addr); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); +} + +TEST_P(UdpSocketTest, Disconnect) { + for (int i = 0; i < 2; i++) { + // Try to connect again. + EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds()); + + // Check that we're connected to the right peer. + struct sockaddr_storage peer; + socklen_t peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&peer), &peerlen), + SyscallSucceeds()); + EXPECT_EQ(peerlen, addrlen_); + EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0); + + // Try to disconnect. + struct sockaddr_storage addr = {}; + addr.ss_family = AF_UNSPEC; + EXPECT_THAT( + connect(s_, reinterpret_cast(&addr), sizeof(addr.ss_family)), + SyscallSucceeds()); + + peerlen = sizeof(peer); + EXPECT_THAT(getpeername(s_, reinterpret_cast(&peer), &peerlen), + SyscallFailsWithErrno(ENOTCONN)); + + // Check that we're still bound. + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getsockname(s_, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_EQ(*Port(&addr), 0); + } +} + +TEST_P(UdpSocketTest, ConnectBadAddress) { + struct sockaddr addr = {}; + addr.sa_family = addr_[0]->sa_family; + ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), + SyscallFailsWithErrno(EINVAL)); +} + TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) { ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds()); @@ -397,7 +542,7 @@ TEST_P(UdpSocketTest, SendAndReceiveNotConnected) { SyscallSucceedsWithValue(sizeof(buf))); // Receive the data. - char received[512]; + char received[sizeof(buf)]; EXPECT_THAT(recv(s_, received, sizeof(received), 0), SyscallSucceedsWithValue(sizeof(received))); EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); @@ -419,7 +564,7 @@ TEST_P(UdpSocketTest, SendAndReceiveConnected) { SyscallSucceedsWithValue(sizeof(buf))); // Receive the data. - char received[512]; + char received[sizeof(buf)]; EXPECT_THAT(recv(s_, received, sizeof(received), 0), SyscallSucceedsWithValue(sizeof(received))); EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); @@ -462,7 +607,7 @@ TEST_P(UdpSocketTest, ReceiveBeforeConnect) { ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds()); // Receive the data. It works because it was sent before the connect. - char received[512]; + char received[sizeof(buf)]; EXPECT_THAT(recv(s_, received, sizeof(received), 0), SyscallSucceedsWithValue(sizeof(received))); EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0); @@ -491,7 +636,7 @@ TEST_P(UdpSocketTest, ReceiveFrom) { SyscallSucceedsWithValue(sizeof(buf))); // Receive the data and sender address. - char received[512]; + char received[sizeof(buf)]; struct sockaddr_storage addr; socklen_t addrlen = sizeof(addr); EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0, -- cgit v1.2.3 From 9f2f9f0cab7ad9bbdcde23e8c98ea42c38c1e4e8 Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Wed, 3 Jul 2019 16:00:29 -0700 Subject: futex: compare keys for equality when doing a FUTEX_UNLOCK_PI. PiperOrigin-RevId: 256453827 --- pkg/sentry/kernel/futex/futex.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 3bd5c04af..278cc8143 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -729,14 +729,14 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool } b := m.lockBucket(&k) - err = m.unlockPILocked(t, addr, tid, b) + err = m.unlockPILocked(t, addr, tid, b, &k) k.release() b.mu.Unlock() return err } -func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error { +func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error { cur, err := t.LoadUint32(addr) if err != nil { return err @@ -746,7 +746,22 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc return syserror.EPERM } - if b.waiters.Empty() { + var next *Waiter // Who's the next owner? + var next2 *Waiter // Who's the one after that? + for w := b.waiters.Front(); w != nil; w = w.Next() { + if !w.key.matches(key) { + continue + } + + if next == nil { + next = w + } else { + next2 = w + break + } + } + + if next == nil { // It's safe to set 0 because there are no waiters, no new owner, and the // executing task is the current owner (no owner died bit). prev, err := t.CompareAndSwapUint32(addr, cur, 0) @@ -761,12 +776,10 @@ func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *buc return nil } - next := b.waiters.Front() - // Set next owner's TID, waiters if there are any. Resets owner died bit, if // set, because the executing task takes over as the owner. val := next.tid - if next.Next() != nil { + if next2 != nil { val |= linux.FUTEX_WAITERS } -- cgit v1.2.3 From da57fb9d25d947195147868253a928f83980c1fd Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Wed, 3 Jul 2019 20:12:16 -0700 Subject: Fix syscall doc for getresgid PiperOrigin-RevId: 256481284 --- pkg/sentry/syscalls/linux/linux64.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 99af33cfd..7bbbb5008 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -168,7 +168,7 @@ var AMD64 = &kernel.SyscallTable{ 117: syscalls.Supported("setresuid", Setresuid), 118: syscalls.Supported("getresuid", Getresuid), 119: syscalls.Supported("setresgid", Setresgid), - 120: syscalls.Supported("setresgid", Getresgid), + 120: syscalls.Supported("getresgid", Getresgid), 121: syscalls.Supported("getpgid", Getpgid), 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) -- cgit v1.2.3 From 67f2cefce02816307805699c3462d6fd7ce61b69 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 3 Jul 2019 22:50:26 -0700 Subject: Avoid importing platforms from many source files PiperOrigin-RevId: 256494243 --- pkg/sentry/platform/BUILD | 1 + pkg/sentry/platform/kvm/BUILD | 2 ++ pkg/sentry/platform/kvm/filters.go | 33 ++++++++++++++++++++++++++++++ pkg/sentry/platform/kvm/kvm.go | 14 +++++++++++++ pkg/sentry/platform/platform.go | 28 ++++++++++++++++++++++++++ pkg/sentry/platform/ptrace/BUILD | 1 + pkg/sentry/platform/ptrace/filters.go | 33 ++++++++++++++++++++++++++++++ pkg/sentry/platform/ptrace/ptrace.go | 15 ++++++++++++++ runsc/BUILD | 2 ++ runsc/boot/BUILD | 3 +-- runsc/boot/config.go | 38 ++--------------------------------- runsc/boot/filter/BUILD | 2 -- runsc/boot/filter/config.go | 23 --------------------- runsc/boot/filter/filter.go | 13 +----------- runsc/boot/loader.go | 20 ++++++------------ runsc/boot/loader_test.go | 1 + runsc/boot/platforms/BUILD | 16 +++++++++++++++ runsc/boot/platforms/platforms.go | 30 +++++++++++++++++++++++++++ runsc/cmd/BUILD | 1 + runsc/cmd/boot.go | 3 ++- runsc/container/BUILD | 1 + runsc/container/container_test.go | 3 ++- runsc/main.go | 7 ++++--- runsc/sandbox/BUILD | 3 ++- runsc/sandbox/sandbox.go | 23 +++++++++------------ runsc/test/testutil/testutil.go | 1 + 26 files changed, 209 insertions(+), 108 deletions(-) create mode 100644 pkg/sentry/platform/kvm/filters.go create mode 100644 pkg/sentry/platform/ptrace/filters.go create mode 100644 runsc/boot/platforms/BUILD create mode 100644 runsc/boot/platforms/platforms.go diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 0b9962b2b..9aa6ec507 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/abi/linux", "//pkg/atomicbitops", "//pkg/log", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/platform/safecopy", diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 9ccf77fdf..ad8b95744 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -14,6 +14,7 @@ go_library( "bluepill_fault.go", "bluepill_unsafe.go", "context.go", + "filters.go", "kvm.go", "kvm_amd64.go", "kvm_amd64_unsafe.go", @@ -33,6 +34,7 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/procid", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters.go new file mode 100644 index 000000000..7d949f1dd --- /dev/null +++ b/pkg/sentry/platform/kvm/filters.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the KVM platform. +func (*KVM) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_IOCTL: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_RT_SIGSUSPEND: {}, + syscall.SYS_RT_SIGTIMEDWAIT: {}, + 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. + } +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index b49d7f3c4..ee4cd2f4d 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -141,3 +141,17 @@ func (k *KVM) NewContext() platform.Context { machine: k.machine, } } + +type constructor struct{} + +func (*constructor) New(f *os.File) (platform.Platform, error) { + return New(f) +} + +func (*constructor) OpenDevice() (*os.File, error) { + return OpenDevice() +} + +func init() { + platform.Register("kvm", &constructor{}) +} diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index eccbe2336..ec22dbf87 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -19,8 +19,10 @@ package platform import ( "fmt" + "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -93,6 +95,9 @@ type Platform interface { // Platforms for which this does not hold may panic if PreemptAllCPUs is // called. PreemptAllCPUs() error + + // SyscallFilters returns syscalls made exclusively by this platform. + SyscallFilters() seccomp.SyscallRules } // NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and @@ -347,3 +352,26 @@ type File interface { func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } + +// Constructor represents a platform type. +type Constructor interface { + New(deviceFile *os.File) (Platform, error) + OpenDevice() (*os.File, error) +} + +// platforms contains all available platform types. +var platforms = map[string]Constructor{} + +// Register registers a new platform type. +func Register(name string, platform Constructor) { + platforms[name] = platform +} + +// Lookup looks up the platform constructor by name. +func Lookup(name string) (Constructor, error) { + p, ok := platforms[name] + if !ok { + return nil, fmt.Errorf("unknown platform: %v", name) + } + return p, nil +} diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 6a1343f47..1b6c54e96 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "ptrace", srcs = [ + "filters.go", "ptrace.go", "ptrace_unsafe.go", "stub_amd64.s", diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go new file mode 100644 index 000000000..1e07cfd0d --- /dev/null +++ b/pkg/sentry/platform/ptrace/filters.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/seccomp" +) + +// SyscallFilters returns syscalls made exclusively by the ptrace platform. +func (*PTrace) SyscallFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + unix.SYS_GETCPU: {}, + unix.SYS_SCHED_SETAFFINITY: {}, + syscall.SYS_PTRACE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_WAIT4: {}, + } +} diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index ee7e0640c..6fd30ed25 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -45,6 +45,7 @@ package ptrace import ( + "os" "sync" "gvisor.dev/gvisor/pkg/abi/linux" @@ -236,3 +237,17 @@ func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan s func (*PTrace) NewContext() platform.Context { return &context{} } + +type constructor struct{} + +func (*constructor) New(*os.File) (platform.Platform, error) { + return New() +} + +func (*constructor) OpenDevice() (*os.File, error) { + return nil, nil +} + +func init() { + platform.Register("ptrace", &constructor{}) +} diff --git a/runsc/BUILD b/runsc/BUILD index 0e4cf8e09..6b8c92706 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -16,6 +16,7 @@ go_binary( x_defs = {"main.version": "{VERSION}"}, deps = [ "//pkg/log", + "//pkg/sentry/platform", "//runsc/boot", "//runsc/cmd", "//runsc/specutils", @@ -47,6 +48,7 @@ go_binary( x_defs = {"main.version": "{VERSION}"}, deps = [ "//pkg/log", + "//pkg/sentry/platform", "//runsc/boot", "//runsc/cmd", "//runsc/specutils", diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 16cd6540f..5025401dd 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -56,8 +56,6 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/platform/kvm", - "//pkg/sentry/platform/ptrace", "//pkg/sentry/sighandling", "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/hostinet", @@ -86,6 +84,7 @@ go_library( "//pkg/tcpip/transport/udp", "//pkg/urpc", "//runsc/boot/filter", + "//runsc/boot/platforms", "//runsc/specutils", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 6d276f207..6f1eb9a41 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -22,40 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" ) -// PlatformType tells which platform to use. -type PlatformType int - -const ( - // PlatformPtrace runs the sandbox with the ptrace platform. - PlatformPtrace PlatformType = iota - - // PlatformKVM runs the sandbox with the KVM platform. - PlatformKVM -) - -// MakePlatformType converts type from string. -func MakePlatformType(s string) (PlatformType, error) { - switch s { - case "ptrace": - return PlatformPtrace, nil - case "kvm": - return PlatformKVM, nil - default: - return 0, fmt.Errorf("invalid platform type %q", s) - } -} - -func (p PlatformType) String() string { - switch p { - case PlatformPtrace: - return "ptrace" - case PlatformKVM: - return "kvm" - default: - return fmt.Sprintf("unknown(%d)", p) - } -} - // FileAccessType tells how the filesystem is accessed. type FileAccessType int @@ -187,7 +153,7 @@ type Config struct { LogPackets bool // Platform is the platform to run on. - Platform PlatformType + Platform string // Strace indicates that strace should be enabled. Strace bool @@ -247,7 +213,7 @@ func (c *Config) ToFlags() []string { "--overlay=" + strconv.FormatBool(c.Overlay), "--network=" + c.Network.String(), "--log-packets=" + strconv.FormatBool(c.LogPackets), - "--platform=" + c.Platform.String(), + "--platform=" + c.Platform, "--strace=" + strconv.FormatBool(c.Strace), "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 07898f3de..f5509b6b7 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -20,8 +20,6 @@ go_library( "//pkg/log", "//pkg/seccomp", "//pkg/sentry/platform", - "//pkg/sentry/platform/kvm", - "//pkg/sentry/platform/ptrace", "//pkg/tcpip/link/fdbased", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index e4ccb40d9..0ee5b8bbd 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -437,29 +437,6 @@ func hostInetFilters() seccomp.SyscallRules { } } -// ptraceFilters returns syscalls made exclusively by the ptrace platform. -func ptraceFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - unix.SYS_GETCPU: {}, - unix.SYS_SCHED_SETAFFINITY: {}, - syscall.SYS_PTRACE: {}, - syscall.SYS_TGKILL: {}, - syscall.SYS_WAIT4: {}, - } -} - -// kvmFilters returns syscalls made exclusively by the KVM platform. -func kvmFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_ARCH_PRCTL: {}, - syscall.SYS_IOCTL: {}, - syscall.SYS_MMAP: {}, - syscall.SYS_RT_SIGSUSPEND: {}, - syscall.SYS_RT_SIGTIMEDWAIT: {}, - 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host. - } -} - func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT: []seccomp.Rule{ diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index 468481f29..e80c171b3 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -18,13 +18,9 @@ package filter import ( - "fmt" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm" - "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" ) // Options are seccomp filter related options. @@ -53,14 +49,7 @@ func Install(opt Options) error { s.Merge(profileFilters()) } - switch p := opt.Platform.(type) { - case *ptrace.PTrace: - s.Merge(ptraceFilters()) - case *kvm.KVM: - s.Merge(kvmFilters()) - default: - return fmt.Errorf("unknown platform type %T", p) - } + s.Merge(opt.Platform.SyscallFilters()) return seccomp.Install(s) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 7e27d1f49..38e426ee7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -42,8 +42,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm" - "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" "gvisor.dev/gvisor/pkg/sentry/sighandling" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/time" @@ -59,6 +57,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" + _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. @@ -416,19 +415,12 @@ func (l *Loader) Destroy() { } func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { - switch conf.Platform { - case PlatformPtrace: - log.Infof("Platform: ptrace") - return ptrace.New() - case PlatformKVM: - log.Infof("Platform: kvm") - if deviceFile == nil { - return nil, fmt.Errorf("kvm device file must be provided") - } - return kvm.New(deviceFile) - default: - return nil, fmt.Errorf("invalid platform %v", conf.Platform) + p, err := platform.Lookup(conf.Platform) + if err != nil { + panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) } + log.Infof("Platform: %s", conf.Platform) + return p.New(deviceFile) } func createMemoryFile() (*pgalloc.MemoryFile, error) { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index eca592e5b..ff713660d 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -47,6 +47,7 @@ func testConfig() *Config { RootDir: "unused_root_dir", Network: NetworkNone, DisableSeccomp: true, + Platform: "ptrace", } } diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD new file mode 100644 index 000000000..03391cdca --- /dev/null +++ b/runsc/boot/platforms/BUILD @@ -0,0 +1,16 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "platforms", + srcs = ["platforms.go"], + importpath = "gvisor.dev/gvisor/runsc/boot/platforms", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + ], +) diff --git a/runsc/boot/platforms/platforms.go b/runsc/boot/platforms/platforms.go new file mode 100644 index 000000000..056b46ad5 --- /dev/null +++ b/runsc/boot/platforms/platforms.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package platforms imports all available platform packages. +package platforms + +import ( + // Import platforms that runsc might use. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +const ( + // Ptrace runs the sandbox with the ptrace platform. + Ptrace = "ptrace" + + // KVM runs the sandbox with the KVM platform. + KVM = "kvm" +) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 2c8b84252..5223b9972 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -46,6 +46,7 @@ go_library( "//pkg/unet", "//pkg/urpc", "//runsc/boot", + "//runsc/boot/platforms", "//runsc/console", "//runsc/container", "//runsc/fsgofer", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 272eb14d3..b40fded5b 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/specutils" ) @@ -172,7 +173,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if caps == nil { caps = &specs.LinuxCapabilities{} } - if conf.Platform == boot.PlatformPtrace { + if conf.Platform == platforms.Ptrace { // Ptrace platform requires extra capabilities. const c = "CAP_SYS_PTRACE" caps.Bounding = append(caps.Bounding, c) diff --git a/runsc/container/BUILD b/runsc/container/BUILD index ebe77165e..e246c38ae 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -53,6 +53,7 @@ go_test( "//pkg/unet", "//pkg/urpc", "//runsc/boot", + "//runsc/boot/platforms", "//runsc/specutils", "//runsc/test/testutil", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index b09b80715..baba09fe1 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/test/testutil" ) @@ -256,7 +257,7 @@ func configs(opts ...configOption) []*boot.Config { if testutil.RaceEnabled { continue } - c.Platform = boot.PlatformKVM + c.Platform = platforms.KVM case nonExclusiveFS: c.FileAccess = boot.FileAccessShared default: diff --git a/runsc/main.go b/runsc/main.go index 135061cd3..bc83c57a2 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -30,6 +30,7 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cmd" "gvisor.dev/gvisor/runsc/specutils" @@ -61,7 +62,7 @@ var ( straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") // Flags that control sandbox runtime behavior. - platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") gso = flag.Bool("gso", true, "enable generic segmenation offload") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") @@ -139,8 +140,8 @@ func main() { } cmd.ErrorLogger = errorLogger - platformType, err := boot.MakePlatformType(*platform) - if err != nil { + platformType := *platformName + if _, err := platform.Lookup(platformType); err != nil { cmd.Fatalf("%v", err) } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index f32da45c1..7fdceaab6 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -18,9 +18,10 @@ go_library( "//pkg/control/server", "//pkg/log", "//pkg/sentry/control", - "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform", "//pkg/urpc", "//runsc/boot", + "//runsc/boot/platforms", "//runsc/cgroup", "//runsc/console", "//runsc/specutils", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 6bebf0737..4a11f617d 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -32,9 +32,10 @@ import ( "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/specutils" @@ -491,7 +492,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF {Type: specs.UTSNamespace}, } - if conf.Platform == boot.PlatformPtrace { + if conf.Platform == platforms.Ptrace { // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. log.Infof("Sandbox will be started in the current PID namespace") @@ -1046,19 +1047,15 @@ func (s *Sandbox) waitForStopped() error { // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. -func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { - var ( - f *os.File - err error - ) - switch p { - case boot.PlatformKVM: - f, err = kvm.OpenDevice() - default: - return nil, nil +func deviceFileForPlatform(name string) (*os.File, error) { + p, err := platform.Lookup(name) + if err != nil { + return nil, err } + + f, err := p.OpenDevice() if err != nil { return nil, fmt.Errorf("opening device file for platform %q: %v", p, err) } - return f, err + return f, nil } diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index ecab6871d..a98675bfc 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -132,6 +132,7 @@ func TestConfig() *boot.Config { LogPackets: true, Network: boot.NetworkNone, Strace: true, + Platform: "ptrace", FileAccess: boot.FileAccessExclusive, TestOnlyAllowRunAsCurrentUserWithoutChroot: true, NumNetworkChannels: 1, -- cgit v1.2.3 From 8f9b1ca8e7066df529b89422937e3212bf761262 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Mon, 8 Jul 2019 10:43:11 -0700 Subject: ext4: disklayout: inode impl. PiperOrigin-RevId: 257010414 --- pkg/sentry/fs/ext4/disklayout/BUILD | 5 + pkg/sentry/fs/ext4/disklayout/block_group_32.go | 9 +- pkg/sentry/fs/ext4/disklayout/block_group_64.go | 6 +- pkg/sentry/fs/ext4/disklayout/disklayout.go | 50 ++++++ pkg/sentry/fs/ext4/disklayout/inode_new.go | 96 ++++++++++ pkg/sentry/fs/ext4/disklayout/inode_old.go | 117 +++++++++++++ pkg/sentry/fs/ext4/disklayout/inode_test.go | 222 ++++++++++++++++++++++++ pkg/sentry/fs/ext4/disklayout/superblock.go | 18 +- pkg/sentry/fs/ext4/disklayout/superblock_32.go | 7 +- pkg/sentry/fs/ext4/disklayout/superblock_64.go | 6 +- pkg/sentry/fs/ext4/disklayout/superblock_old.go | 8 +- 11 files changed, 504 insertions(+), 40 deletions(-) create mode 100644 pkg/sentry/fs/ext4/disklayout/disklayout.go create mode 100644 pkg/sentry/fs/ext4/disklayout/inode_new.go create mode 100644 pkg/sentry/fs/ext4/disklayout/inode_old.go create mode 100644 pkg/sentry/fs/ext4/disklayout/inode_test.go diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD index e3d2f8d71..d1cbab275 100644 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ b/pkg/sentry/fs/ext4/disklayout/BUILD @@ -8,7 +8,10 @@ go_library( "block_group.go", "block_group_32.go", "block_group_64.go", + "disklayout.go", "inode.go", + "inode_new.go", + "inode_old.go", "superblock.go", "superblock_32.go", "superblock_64.go", @@ -29,7 +32,9 @@ go_test( size = "small", srcs = [ "block_group_test.go", + "inode_test.go", "superblock_test.go", ], embed = [":disklayout"], + deps = ["//pkg/sentry/kernel/time"], ) diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext4/disklayout/block_group_32.go index dadecb3e3..3e16c76db 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group_32.go @@ -17,12 +17,6 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and // 32-bit ext4 filesystems. It implements BlockGroup interface. -// -// The suffix `Lo` here stands for lower bits because this is also used in the -// 64-bit version where these fields represent the lower half of the fields. -// The suffix `Raw` has been added to indicate that the field does not have a -// counterpart in the 64-bit version and to resolve name collision with the -// interface. type BlockGroup32Bit struct { BlockBitmapLo uint32 InodeBitmapLo uint32 @@ -38,6 +32,9 @@ type BlockGroup32Bit struct { ChecksumRaw uint16 } +// Compiles only if BlockGroup32Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup32Bit)(nil) + // InodeTable implements BlockGroup.InodeTable. func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_64.go b/pkg/sentry/fs/ext4/disklayout/block_group_64.go index 27de3990d..9a809197a 100644 --- a/pkg/sentry/fs/ext4/disklayout/block_group_64.go +++ b/pkg/sentry/fs/ext4/disklayout/block_group_64.go @@ -18,9 +18,6 @@ package disklayout // It is the block group descriptor struct for 64-bit ext4 filesystems. // It implements BlockGroup interface. It is an extension of the 32-bit // version of BlockGroup. -// -// The suffix `Hi` here stands for upper bits because they represent the upper -// half of the fields. type BlockGroup64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. @@ -40,6 +37,9 @@ type BlockGroup64Bit struct { _ uint32 // Padding to 64 bytes. } +// Compiles only if BlockGroup64Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup64Bit)(nil) + // Methods to override. Checksum() and Flags() are not overridden. // InodeTable implements BlockGroup.InodeTable. diff --git a/pkg/sentry/fs/ext4/disklayout/disklayout.go b/pkg/sentry/fs/ext4/disklayout/disklayout.go new file mode 100644 index 000000000..bdf4e2132 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/disklayout.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package disklayout provides Linux ext file system's disk level structures +// which can be directly read into from the underlying device. Structs aim to +// emulate structures `exactly` how they are layed out on disk. +// +// This library aims to be compatible with all ext(2/3/4) systems so it +// provides a generic interface for all major structures and various +// implementations (for different versions). The user code is responsible for +// using appropriate implementations based on the underlying device. +// +// Interfacing all major structures here serves a few purposes: +// - Abstracts away the complexity of the underlying structure from client +// code. The client only has to figure out versioning on set up and then +// can use these as black boxes and pass it higher up the stack. +// - Having pointer receivers forces the user to use pointers to these +// heavy structs. Hence, prevents the client code from unintentionally +// copying these by value while passing the interface around. +// - Version-based implementation selection is resolved on set up hence +// avoiding per call overhead of choosing implementation. +// - All interface methods are pretty light weight (do not take in any +// parameters by design). Passing pointer arguments to interface methods +// can lead to heap allocation as the compiler won't be able to perform +// escape analysis on an unknown implementation at compile time. +// +// Notes: +// - All fields in these structs are exported because binary.Read would +// panic otherwise. +// - All structures on disk are in little-endian order. Only jbd2 (journal) +// structures are in big-endian order. +// - All OS dependent fields in these structures will be interpretted using +// the Linux version of that field. +// - The suffix `Lo` in field names stands for lower bits of that field. +// - The suffix `Hi` in field names stands for upper bits of that field. +// - The suffix `Raw` has been added to indicate that the field is not split +// into Lo and Hi fields and also to resolve name collision with the +// respective interface. +package disklayout diff --git a/pkg/sentry/fs/ext4/disklayout/inode_new.go b/pkg/sentry/fs/ext4/disklayout/inode_new.go new file mode 100644 index 000000000..4f5348372 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/inode_new.go @@ -0,0 +1,96 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/kernel/time" + +// InodeNew represents ext4 inode structure which can be bigger than +// OldInodeSize. The actual size of this struct should be determined using +// inode.ExtraInodeSize. Accessing any field here should be verified with the +// actual size. The extra space between the end of the inode struct and end of +// the inode record can be used to store extended attr. +// +// If the TimeExtra fields are in scope, the lower 2 bits of those are used +// to extend their counter part to be 34 bits wide; the rest (upper) 30 bits +// are used to provide nanoscond precision. Hence, these timestamps will now +// overflow in May 2446. +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +type InodeNew struct { + InodeOld + + ExtraInodeSize uint16 + ChecksumHi uint16 + ChangeTimeExtra uint32 + ModificationTimeExtra uint32 + AccessTimeExtra uint32 + CreationTime uint32 + CreationTimeExtra uint32 + VersionHi uint32 + ProjectID uint32 +} + +// Compiles only if InodeNew implements Inode. +var _ Inode = (*InodeNew)(nil) + +// fromExtraTime decodes the extra time and constructs the kernel time struct +// with nanosecond precision. +func fromExtraTime(lo int32, extra uint32) time.Time { + // See description above InodeNew for format. + seconds := (int64(extra&0x3) << 32) + int64(lo) + nanoseconds := int64(extra >> 2) + return time.FromUnix(seconds, nanoseconds) +} + +// Only override methods which change due to ext4 specific fields. + +// Size implements Inode.Size. +func (in *InodeNew) Size() uint64 { + return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeNew) InodeSize() uint16 { + return oldInodeSize + in.ExtraInodeSize +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeNew) ChangeTime() time.Time { + // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. + if in.ExtraInodeSize >= 8 { + return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) + } + + return in.InodeOld.ChangeTime() +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeNew) ModificationTime() time.Time { + // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. + if in.ExtraInodeSize >= 12 { + return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) + } + + return in.InodeOld.ModificationTime() +} + +// AccessTime implements Inode.AccessTime. +func (in *InodeNew) AccessTime() time.Time { + // Apply new timestamp logic if inode.AccessTimeExtra is in scope. + if in.ExtraInodeSize >= 16 { + return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) + } + + return in.InodeOld.AccessTime() +} diff --git a/pkg/sentry/fs/ext4/disklayout/inode_old.go b/pkg/sentry/fs/ext4/disklayout/inode_old.go new file mode 100644 index 000000000..dc4c9d8e4 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/inode_old.go @@ -0,0 +1,117 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +const ( + // oldInodeSize is the inode size in ext2/ext3. + oldInodeSize = 128 +) + +// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. +// Inode struct size and record size are both 128 bytes for this. +// +// All fields representing time are in seconds since the epoch. Which means that +// they will overflow in January 2038. +type InodeOld struct { + ModeRaw uint16 + UIDLo uint16 + SizeLo uint32 + + // The time fields are signed integers because they could be negative to + // represent time before the epoch. + AccessTimeRaw int32 + ChangeTimeRaw int32 + ModificationTimeRaw int32 + DeletionTimeRaw int32 + + GIDLo uint16 + LinksCountRaw uint16 + BlocksCountLo uint32 + FlagsRaw uint32 + VersionLo uint32 // This is OS dependent. + BlocksRaw [60]byte + Generation uint32 + FileACLLo uint32 + SizeHi uint32 + ObsoFaddr uint32 + + // OS dependent fields have been inlined here. + BlocksCountHi uint16 + FileACLHi uint16 + UIDHi uint16 + GIDHi uint16 + ChecksumLo uint16 + _ uint16 +} + +// Compiles only if InodeOld implements Inode. +var _ Inode = (*InodeOld)(nil) + +// Mode implements Inode.Mode. +func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } + +// UID implements Inode.UID. +func (in *InodeOld) UID() auth.KUID { + return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) +} + +// GID implements Inode.GID. +func (in *InodeOld) GID() auth.KGID { + return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) +} + +// Size implements Inode.Size. +func (in *InodeOld) Size() uint64 { + // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. + return uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeOld) InodeSize() uint16 { return oldInodeSize } + +// AccessTime implements Inode.AccessTime. +func (in *InodeOld) AccessTime() time.Time { + return time.FromUnix(int64(in.AccessTimeRaw), 0) +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeOld) ChangeTime() time.Time { + return time.FromUnix(int64(in.ChangeTimeRaw), 0) +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeOld) ModificationTime() time.Time { + return time.FromUnix(int64(in.ModificationTimeRaw), 0) +} + +// DeletionTime implements Inode.DeletionTime. +func (in *InodeOld) DeletionTime() time.Time { + return time.FromUnix(int64(in.DeletionTimeRaw), 0) +} + +// LinksCount implements Inode.LinksCount. +func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } + +// Flags implements Inode.Flags. +func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } + +// Blocks implements Inode.Blocks. +func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw } diff --git a/pkg/sentry/fs/ext4/disklayout/inode_test.go b/pkg/sentry/fs/ext4/disklayout/inode_test.go new file mode 100644 index 000000000..9cae9e4f0 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/inode_test.go @@ -0,0 +1,222 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// TestInodeSize tests that the inode structs are of the correct size. +func TestInodeSize(t *testing.T) { + assertSize(t, InodeOld{}, oldInodeSize) + + // This was updated from 156 bytes to 160 bytes in Oct 2015. + assertSize(t, InodeNew{}, 160) +} + +// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in +// ext4 inode structs are decoded correctly. +// +// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +func TestTimestampSeconds(t *testing.T) { + type timestampTest struct { + // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. + // If this is set then the 32-bit time is negative. + msbSet bool + + // lowerBound tells if we should take the lowest possible value of + // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to + // false it tells to take the highest possible value. + lowerBound bool + + // extraBits is InodeNew.[X]TimeExtra. + extraBits uint32 + + // want is the kernel time struct that is expected. + want time.Time + } + + tests := []timestampTest{ + // 1901-12-13 + { + msbSet: true, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(-0x80000000), 0), + }, + + // 1969-12-31 + { + msbSet: true, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(-1), 0), + }, + + // 1970-01-01 + { + msbSet: false, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(0), 0), + }, + + // 2038-01-19 + { + msbSet: false, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(0x7fffffff), 0), + }, + + // 2038-01-19 + { + msbSet: true, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x80000000), 0), + }, + + // 2106-02-07 + { + msbSet: true, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0xffffffff), 0), + }, + + // 2106-02-07 + { + msbSet: false, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x100000000), 0), + }, + + // 2174-02-25 + { + msbSet: false, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0x17fffffff), 0), + }, + + // 2174-02-25 + { + msbSet: true, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x180000000), 0), + }, + + // 2242-03-16 + { + msbSet: true, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x1ffffffff), 0), + }, + + // 2242-03-16 + { + msbSet: false, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x200000000), 0), + }, + + // 2310-04-04 + { + msbSet: false, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x27fffffff), 0), + }, + + // 2310-04-04 + { + msbSet: true, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x280000000), 0), + }, + + // 2378-04-22 + { + msbSet: true, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x2ffffffff), 0), + }, + + // 2378-04-22 + { + msbSet: false, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x300000000), 0), + }, + + // 2446-05-10 + { + msbSet: false, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x37fffffff), 0), + }, + } + + lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 + upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 + lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 + upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 + + get32BitTime := func(test timestampTest) int32 { + if test.msbSet { + if test.lowerBound { + return lowerMSB1 + } + + return upperMSB1 + } + + if test.lowerBound { + return lowerMSB0 + } + + return upperMSB0 + } + + getTestName := func(test timestampTest) string { + return fmt.Sprintf( + "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", + strconv.FormatInt(int64(test.extraBits), 2), + test.msbSet, + test.lowerBound, + ) + } + + for _, test := range tests { + t.Run(getTestName(test), func(t *testing.T) { + if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { + t.Errorf("Expected: %v, Got: %v", test.want, got) + } + }) + } +} diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext4/disklayout/superblock.go index 030c73410..e4b8f46fb 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock.go +++ b/pkg/sentry/fs/ext4/disklayout/superblock.go @@ -12,22 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package disklayout provides Linux ext file system's disk level structures -// which can be directly read into from the underlying device. All structures -// on disk are in little-endian order. Only jbd2 (journal) structures are in -// big-endian order. Structs aim to emulate structures `exactly` how they are -// layed out on disk. -// -// This library aims to be compatible with all ext(2/3/4) systems so it -// provides a generic interface for all major structures and various -// implementations (for different versions). The user code is responsible for -// using appropriate implementations based on the underlying device. -// -// Notes: -// - All fields in these structs are exported because binary.Read would -// panic otherwise. -// - All OS dependent fields in these structures will be interpretted using -// the Linux version of that field. package disklayout // SuperBlock should be implemented by structs representing the ext superblock. @@ -109,7 +93,7 @@ type SuperBlock interface { // - Inode disk record size = sb.s_inode_size (function return value). // = 256 (default) // - Inode struct size = 128 + inode.i_extra_isize. - // = 128 + 28 = 156 (default) + // = 128 + 32 = 160 (default) InodeSize() uint16 // InodesPerGroup returns the number of inodes in a block group. diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_32.go b/pkg/sentry/fs/ext4/disklayout/superblock_32.go index 4c3233eed..587e4afaa 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock_32.go +++ b/pkg/sentry/fs/ext4/disklayout/superblock_32.go @@ -16,10 +16,6 @@ package disklayout // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of // the ext4_super_block struct in fs/ext4/ext4.h. -// -// The suffix `Raw` has been added to indicate that the field does not have a -// counterpart in the 64-bit version and to resolve name collision with the -// interface. type SuperBlock32Bit struct { // We embed the old superblock struct here because the 32-bit version is just // an extension of the old version. @@ -52,6 +48,9 @@ type SuperBlock32Bit struct { JnlBlocks [17]uint32 } +// Compiles only if SuperBlock32Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock32Bit)(nil) + // Only override methods which change based on the additional fields above. // Not overriding SuperBlock.BgDescSize because it would still return 32 here. diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_64.go b/pkg/sentry/fs/ext4/disklayout/superblock_64.go index 2e945a7c7..a2c2278fb 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock_64.go +++ b/pkg/sentry/fs/ext4/disklayout/superblock_64.go @@ -18,9 +18,6 @@ package disklayout // the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly // 1024 bytes (smallest possible block size) and hence the superblock always // fits in no more than one data block. -// -// The suffix `Hi` here stands for upper bits because they represent the upper -// half of the fields. type SuperBlock64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. @@ -78,6 +75,9 @@ type SuperBlock64Bit struct { Checksum uint32 } +// Compiles only if SuperBlock64Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock64Bit)(nil) + // Only override methods which change based on the 64-bit feature. // BlocksCount implements SuperBlock.BlocksCount. diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_old.go b/pkg/sentry/fs/ext4/disklayout/superblock_old.go index 1f7425ba3..c74953610 100644 --- a/pkg/sentry/fs/ext4/disklayout/superblock_old.go +++ b/pkg/sentry/fs/ext4/disklayout/superblock_old.go @@ -16,12 +16,6 @@ package disklayout // SuperBlockOld implements SuperBlock and represents the old version of the // superblock struct in ext2 and ext3 systems. -// -// The suffix `Lo` here stands for lower bits because this is also used in the -// 64-bit version where these fields represent the lower half of the fields. -// The suffix `Raw` has been added to indicate that the field does not have a -// counterpart in the 64-bit version and to resolve name collision with the -// interface. type SuperBlockOld struct { InodesCountRaw uint32 BlocksCountLo uint32 @@ -84,7 +78,7 @@ func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterS func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } // InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlockOld) InodeSize() uint16 { return 128 } +func (sb *SuperBlockOld) InodeSize() uint16 { return oldInodeSize } // InodesPerGroup implements SuperBlock.InodesPerGroup. func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } -- cgit v1.2.3 From 659bebab8e83ec9b5f6fef26ca27048af526ee40 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 8 Jul 2019 12:55:37 -0700 Subject: Don't try to execute a file that is not regular. PiperOrigin-RevId: 257037608 --- pkg/sentry/fs/mounts.go | 5 ++++ runsc/container/container_test.go | 57 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index ce7ffeed2..693ffc760 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -663,6 +663,11 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s } defer d.DecRef() + // Check that it is a regular file. + if !IsRegular(d.Inode.StableAttr) { + continue + } + // Check whether we can read and execute the found file. if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index baba09fe1..c1d6ca7b8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -17,6 +17,7 @@ package container import ( "bytes" "fmt" + "io" "io/ioutil" "os" "path" @@ -409,6 +410,46 @@ func TestLifecycle(t *testing.T) { // Test the we can execute the application with different path formats. func TestExePath(t *testing.T) { + // Create two directories that will be prepended to PATH. + firstPath, err := ioutil.TempDir(testutil.TmpDir(), "first") + if err != nil { + t.Fatal(err) + } + secondPath, err := ioutil.TempDir(testutil.TmpDir(), "second") + if err != nil { + t.Fatal(err) + } + + // Create two minimal executables in the second path, two of which + // will be masked by files in first path. + for _, p := range []string{"unmasked", "masked1", "masked2"} { + path := filepath.Join(secondPath, p) + f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0777) + if err != nil { + t.Fatal(err) + } + defer f.Close() + if _, err := io.WriteString(f, "#!/bin/true\n"); err != nil { + t.Fatal(err) + } + } + + // Create a non-executable file in the first path which masks a healthy + // executable in the second. + nonExecutable := filepath.Join(firstPath, "masked1") + f2, err := os.OpenFile(nonExecutable, os.O_CREATE|os.O_EXCL, 0666) + if err != nil { + t.Fatal(err) + } + f2.Close() + + // Create a non-regular file in the first path which masks a healthy + // executable in the second. + nonRegular := filepath.Join(firstPath, "masked2") + if err := os.Mkdir(nonRegular, 0777); err != nil { + t.Fatal(err) + } + for _, conf := range configs(overlay) { t.Logf("Running test with conf: %+v", conf) for _, test := range []struct { @@ -421,8 +462,24 @@ func TestExePath(t *testing.T) { {path: "thisfiledoesntexit", success: false}, {path: "bin/thisfiledoesntexit", success: false}, {path: "/bin/thisfiledoesntexit", success: false}, + + {path: "unmasked", success: true}, + {path: filepath.Join(firstPath, "unmasked"), success: false}, + {path: filepath.Join(secondPath, "unmasked"), success: true}, + + {path: "masked1", success: true}, + {path: filepath.Join(firstPath, "masked1"), success: false}, + {path: filepath.Join(secondPath, "masked1"), success: true}, + + {path: "masked2", success: true}, + {path: filepath.Join(firstPath, "masked2"), success: false}, + {path: filepath.Join(secondPath, "masked2"), success: true}, } { spec := testutil.NewSpecWithArgs(test.path) + spec.Process.Env = []string{ + fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")), + } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("exec: %s, error setting up container: %v", test.path, err) -- cgit v1.2.3 From e45d724948cf03a7aca871971e75f2cfe1a3bc1f Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Mon, 8 Jul 2019 13:19:54 -0700 Subject: Internal change. PiperOrigin-RevId: 257042681 --- test/util/test_util.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 4fe15afaa..e42bba04a 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -231,7 +231,6 @@ void TestInit(int* argc, char*** argv) { sa.sa_handler = SIG_IGN; TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0); } - gvisor:case-end } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 6db3f8d54c0225e6b6c3d8eef30b4b61498848b7 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 8 Jul 2019 14:56:09 -0700 Subject: Don't mask errors in createAt loop. The error set in the loop in createAt was being masked by other errors declared with ":=". This allowed an ErrResolveViaReadlink error to escape, which can cause a sentry panic. Added test case which repros without the fix. PiperOrigin-RevId: 257061767 --- pkg/sentry/syscalls/linux/sys_file.go | 9 ++++++--- test/syscalls/linux/symlink.cc | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index eb6f5648f..40722abc2 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -353,7 +353,8 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l // No more resolution necessary. defer resolved.DecRef() break - } else if err != fs.ErrResolveViaReadlink { + } + if err != fs.ErrResolveViaReadlink { return err } @@ -363,15 +364,17 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l } // Resolve the symlink to a path via Readlink. - path, err := found.Inode.Readlink(t) + var path string + path, err = found.Inode.Readlink(t) if err != nil { break } remainingTraversals-- // Get the new parent from the target path. + var newParent *fs.Dirent newParentPath, newName := fs.SplitLast(path) - newParent, err := t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) + newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals) if err != nil { break } diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index 69650a1d3..b249ff91f 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -325,6 +325,21 @@ TEST_P(ParamSymlinkTest, CreateExistingSelfLink) { ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds()); } +// Test that opening a file that is a symlink to its parent directory fails +// with ELOOP. +TEST_P(ParamSymlinkTest, CreateExistingParentLink) { + ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); + + const std::string linkpath = GetParam(); + const std::string target = JoinPath(linkpath, "child"); + ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds()); + + EXPECT_THAT(open(linkpath.c_str(), O_CREAT, 0666), + SyscallFailsWithErrno(ELOOP)); + + ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds()); +} + // Test that opening an existing symlink with O_CREAT|O_EXCL will fail with // EEXIST. TEST_P(ParamSymlinkTest, OpenLinkExclFails) { -- cgit v1.2.3 From cceef9d2cfbf72a7ae4feac2e53e46179c33155d Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 9 Jul 2019 16:16:46 -0700 Subject: Cleanup straggling syscall dependencies. PiperOrigin-RevId: 257293198 --- pkg/abi/linux/BUILD | 2 + pkg/abi/linux/clone.go | 41 +++++++++ pkg/abi/linux/epoll.go | 56 +++++++++++ pkg/abi/linux/fcntl.go | 36 ++++++-- pkg/abi/linux/file.go | 43 ++++++--- pkg/abi/linux/ipc.go | 6 +- pkg/abi/linux/netlink.go | 8 +- pkg/abi/linux/netlink_route.go | 12 +-- pkg/abi/linux/sched.go | 6 ++ pkg/abi/linux/time.go | 6 ++ pkg/sentry/syscalls/epoll.go | 26 +++--- pkg/sentry/syscalls/linux/error.go | 3 +- pkg/sentry/syscalls/linux/linux64.go | 148 +++++++++++++++--------------- pkg/sentry/syscalls/linux/sigset.go | 4 +- pkg/sentry/syscalls/linux/sys_epoll.go | 26 +++--- pkg/sentry/syscalls/linux/sys_eventfd.go | 5 +- pkg/sentry/syscalls/linux/sys_file.go | 22 ++--- pkg/sentry/syscalls/linux/sys_getdents.go | 20 ++-- pkg/sentry/syscalls/linux/sys_inotify.go | 13 ++- pkg/sentry/syscalls/linux/sys_pipe.go | 7 +- pkg/sentry/syscalls/linux/sys_prctl.go | 30 +++--- pkg/sentry/syscalls/linux/sys_sched.go | 23 +++-- pkg/sentry/syscalls/linux/sys_seccomp.go | 9 +- pkg/sentry/syscalls/linux/sys_socket.go | 123 ++++++++++++------------- pkg/sentry/syscalls/linux/sys_thread.go | 94 +++++++++---------- pkg/sentry/syscalls/linux/sys_timer.go | 6 +- pkg/sentry/syscalls/linux/sys_tls.go | 7 +- pkg/sentry/syscalls/linux/timespec.go | 7 +- pkg/sentry/syscalls/syscalls.go | 5 +- pkg/syserror/syserror.go | 2 + 30 files changed, 461 insertions(+), 335 deletions(-) create mode 100644 pkg/abi/linux/clone.go create mode 100644 pkg/abi/linux/epoll.go diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index cd405aa96..2061b38c0 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -13,8 +13,10 @@ go_library( "audit.go", "bpf.go", "capability.go", + "clone.go", "dev.go", "elf.go", + "epoll.go", "errors.go", "eventfd.go", "exec.go", diff --git a/pkg/abi/linux/clone.go b/pkg/abi/linux/clone.go new file mode 100644 index 000000000..c2cbfca5e --- /dev/null +++ b/pkg/abi/linux/clone.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Clone constants per clone(2). +const ( + CLONE_VM = 0x100 + CLONE_FS = 0x200 + CLONE_FILES = 0x400 + CLONE_SIGHAND = 0x800 + CLONE_PARENT = 0x8000 + CLONE_PTRACE = 0x2000 + CLONE_VFORK = 0x4000 + CLONE_THREAD = 0x10000 + CLONE_NEWNS = 0x20000 + CLONE_SYSVSEM = 0x40000 + CLONE_SETTLS = 0x80000 + CLONE_PARENT_SETTID = 0x100000 + CLONE_CHILD_CLEARTID = 0x200000 + CLONE_DETACHED = 0x400000 + CLONE_UNTRACED = 0x800000 + CLONE_CHILD_SETTID = 0x1000000 + CLONE_NEWUTS = 0x4000000 + CLONE_NEWIPC = 0x8000000 + CLONE_NEWUSER = 0x10000000 + CLONE_NEWPID = 0x20000000 + CLONE_NEWNET = 0x40000000 + CLONE_IO = 0x80000000 +) diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go new file mode 100644 index 000000000..72083b604 --- /dev/null +++ b/pkg/abi/linux/epoll.go @@ -0,0 +1,56 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// EpollEvent is equivalent to struct epoll_event from epoll(2). +type EpollEvent struct { + Events uint32 + Fd int32 + Data int32 +} + +// Event masks. +const ( + EPOLLIN = 0x1 + EPOLLPRI = 0x2 + EPOLLOUT = 0x4 + EPOLLERR = 0x8 + EPOLLHUP = 0x10 + EPOLLRDNORM = 0x40 + EPOLLRDBAND = 0x80 + EPOLLWRNORM = 0x100 + EPOLLWRBAND = 0x200 + EPOLLMSG = 0x400 + EPOLLRDHUP = 0x2000 +) + +// Per-file descriptor flags. +const ( + EPOLLET = 0x80000000 + EPOLLONESHOT = 0x40000000 +) + +// Operation flags. +const ( + EPOLL_CLOEXEC = 0x80000 + EPOLL_NONBLOCK = 0x800 +) + +// Control operations. +const ( + EPOLL_CTL_ADD = 0x1 + EPOLL_CTL_DEL = 0x2 + EPOLL_CTL_MOD = 0x3 +) diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index 30902a8ca..f78315ebf 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -16,21 +16,39 @@ package linux // Commands from linux/fcntl.h. const ( - F_DUPFD = 0 - F_GETFD = 1 - F_GETFL = 3 - F_GETOWN = 9 - F_SETFD = 2 - F_SETFL = 4 - F_SETLK = 6 - F_SETLKW = 7 - F_SETOWN = 8 + F_DUPFD = 0x0 + F_GETFD = 0x1 + F_SETFD = 0x2 + F_GETFL = 0x3 + F_SETFL = 0x4 + F_SETLK = 0x6 + F_SETLKW = 0x7 + F_SETOWN = 0x8 + F_GETOWN = 0x9 F_DUPFD_CLOEXEC = 1024 + 6 F_SETPIPE_SZ = 1024 + 7 F_GETPIPE_SZ = 1024 + 8 ) +// Commands for F_SETLK. +const ( + F_RDLCK = 0x0 + F_WRLCK = 0x1 + F_UNLCK = 0x2 +) + // Flags for fcntl. const ( FD_CLOEXEC = 00000001 ) + +// Lock structure for F_SETLK. +type Flock struct { + Type int16 + Whence int16 + _ [4]byte + Start int64 + Len int64 + Pid int32 + _ [4]byte +} diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 426003eb7..f78ffaa82 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -161,23 +161,36 @@ const ( // Stat represents struct stat. type Stat struct { - Dev uint64 - Ino uint64 - Nlink uint64 - Mode uint32 - UID uint32 - GID uint32 - X_pad0 int32 - Rdev uint64 - Size int64 - Blksize int64 - Blocks int64 - ATime Timespec - MTime Timespec - CTime Timespec - X_unused [3]int64 + Dev uint64 + Ino uint64 + Nlink uint64 + Mode uint32 + UID uint32 + GID uint32 + _ int32 + Rdev uint64 + Size int64 + Blksize int64 + Blocks int64 + ATime Timespec + MTime Timespec + CTime Timespec + _ [3]int64 } +// File types. +const ( + DT_BLK = 0x6 + DT_CHR = 0x2 + DT_DIR = 0x4 + DT_FIFO = 0x1 + DT_LNK = 0xa + DT_REG = 0x8 + DT_SOCK = 0xc + DT_UNKNOWN = 0x0 + DT_WHT = 0xe +) + // SizeOfStat is the size of a Stat struct. var SizeOfStat = binary.Size(Stat{}) diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 2ef8d6cbb..22acd2d43 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -44,10 +44,10 @@ type IPCPerm struct { CUID uint32 CGID uint32 Mode uint16 - pad1 uint16 + _ uint16 Seq uint16 - pad2 uint16 - pad3 uint32 + _ uint16 + _ uint32 unused1 uint64 unused2 uint64 } diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 5e718c363..e8b6544b4 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -41,10 +41,10 @@ const ( // SockAddrNetlink is struct sockaddr_nl, from uapi/linux/netlink.h. type SockAddrNetlink struct { - Family uint16 - Padding uint16 - PortID uint32 - Groups uint32 + Family uint16 + _ uint16 + PortID uint32 + Groups uint32 } // SockAddrNetlinkSize is the size of SockAddrNetlink. diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 630dc339a..dd698e2bc 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -86,12 +86,12 @@ const ( // InterfaceInfoMessage is struct ifinfomsg, from uapi/linux/rtnetlink.h. type InterfaceInfoMessage struct { - Family uint8 - Padding uint8 - Type uint16 - Index int32 - Flags uint32 - Change uint32 + Family uint8 + _ uint8 + Type uint16 + Index int32 + Flags uint32 + Change uint32 } // Interface flags, from uapi/linux/if.h. diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go index 193d9a242..70e820823 100644 --- a/pkg/abi/linux/sched.go +++ b/pkg/abi/linux/sched.go @@ -28,3 +28,9 @@ const ( // reverted back to SCHED_NORMAL on fork. SCHED_RESET_ON_FORK = 0x40000000 ) + +const ( + PRIO_PGRP = 0x1 + PRIO_PROCESS = 0x0 + PRIO_USER = 0x2 +) diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index e727066d7..546668bca 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -241,3 +241,9 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { Nsec: uint32(nsec % 1e9), } } + +// Utime represents struct utimbuf used by utimes(2). +type Utime struct { + Actime int64 + Modtime int64 +} diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index 11850dd0f..87dcad18b 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -15,12 +15,12 @@ package syscalls import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -44,21 +44,21 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to add the entry. @@ -70,21 +70,21 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to update the entry. @@ -96,21 +96,21 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syscall.EBADF + return syserror.EBADF } defer epollfile.DecRef() // Get the target file id. file := t.GetFile(fd) if file == nil { - return syscall.EBADF + return syserror.EBADF } defer file.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syscall.EBADF + return syserror.EBADF } // Try to remove the entry. @@ -122,14 +122,14 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, e // Get epoll from the file descriptor. epollfile := t.GetFile(fd) if epollfile == nil { - return nil, syscall.EBADF + return nil, syserror.EBADF } defer epollfile.DecRef() // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return nil, syscall.EBADF + return nil, syserror.EBADF } // Try to read events and return right away if we got them or if the @@ -162,7 +162,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, e } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syscall.ETIMEDOUT { + if err == syserror.ETIMEDOUT { return nil, nil } diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index ac3905c5c..264301bfa 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -17,7 +17,6 @@ package linux import ( "io" "sync" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" @@ -54,7 +53,7 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syscall.EFBIG + return syserror.EFBIG case syserror.ErrInterrupted: // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 7bbbb5008..680c0c64c 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -16,8 +16,6 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -116,10 +114,10 @@ var AMD64 = &kernel.SyscallTable{ 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), 67: syscalls.Supported("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), @@ -170,8 +168,8 @@ var AMD64 = &kernel.SyscallTable{ 119: syscalls.Supported("setresgid", Setresgid), 120: syscalls.Supported("getresgid", Getresgid), 121: syscalls.Supported("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 124: syscalls.Supported("getsid", Getsid), 125: syscalls.Supported("capget", Capget), 126: syscalls.Supported("capset", Capset), @@ -182,12 +180,12 @@ var AMD64 = &kernel.SyscallTable{ 131: syscalls.Supported("sigaltstack", Sigaltstack), 132: syscalls.Supported("utime", Utime), 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), - 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil), + 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), @@ -196,15 +194,15 @@ var AMD64 = &kernel.SyscallTable{ 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil), - 156: syscalls.Error("sysctl", syscall.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), @@ -225,50 +223,50 @@ var AMD64 = &kernel.SyscallTable{ 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil), - 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in Linux > 2.6.", nil), + 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Removed after Linux 3.1.", nil), - 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil), - 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux.", nil), - 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux.", nil), - 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux.", nil), - 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux.", nil), + 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), - 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) - 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), - 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support.", nil), + 187: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) + 188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 200: syscalls.Supported("tkill", Tkill), 201: syscalls.Supported("time", Time), 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 213: syscalls.Supported("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated.", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated.", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since Linux 3.16.", nil), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), 217: syscalls.Supported("getdents64", Getdents64), 218: syscalls.Supported("set_tid_address", SetTidAddress), 219: syscalls.Supported("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), 222: syscalls.Supported("timer_create", TimerCreate), 223: syscalls.Supported("timer_settime", TimerSettime), @@ -284,21 +282,21 @@ var AMD64 = &kernel.SyscallTable{ 233: syscalls.Supported("epoll_ctl", EpollCtl), 234: syscalls.Supported("tgkill", Tgkill), 235: syscalls.Supported("utimes", Utimes), - 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil), + 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), 247: syscalls.Supported("waitid", Waitid), - 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user.", nil), - 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user.", nil), - 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user.", nil), + 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), @@ -321,23 +319,23 @@ var AMD64 = &kernel.SyscallTable{ 270: syscalls.Supported("pselect", Pselect), 271: syscalls.Supported("ppoll", Ppoll), 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete.", nil), + 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 276: syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 280: syscalls.Supported("utimensat", Utimensat), 281: syscalls.Supported("epoll_pwait", EpollPwait), - 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 282: syscalls.ErrorWithEvent("signalfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) 283: syscalls.Supported("timerfd_create", TimerfdCreate), 284: syscalls.Supported("eventfd", Eventfd), 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), 286: syscalls.Supported("timerfd_settime", TimerfdSettime), 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), 288: syscalls.Supported("accept4", Accept4), - 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 289: syscalls.ErrorWithEvent("signalfd4", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) 290: syscalls.Supported("eventfd2", Eventfd2), 291: syscalls.Supported("epoll_create1", EpollCreate1), 292: syscalls.Supported("dup3", Dup3), @@ -346,37 +344,37 @@ var AMD64 = &kernel.SyscallTable{ 295: syscalls.Supported("preadv", Preadv), 296: syscalls.Supported("pwritev", Pwritev), 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil), + 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 302: syscalls.Supported("prlimit64", Prlimit64), - 303: syscalls.Error("name_to_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 304: syscalls.Error("open_by_handle_at", syscall.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 309: syscalls.Supported("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) 317: syscalls.Supported("seccomp", Seccomp), 318: syscalls.Supported("getrandom", GetRandom), 319: syscalls.Supported("memfd_create", MemfdCreate), 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) - 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) + 322: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) + 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 325 are "backports" from versions of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), + 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), 327: syscalls.Supported("preadv2", Preadv2), 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), 332: syscalls.Supported("statx", Statx), diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 00b7e7cf2..333013d8c 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -15,8 +15,6 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -27,7 +25,7 @@ import ( // STOP are clear. func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } b := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index a0dd4d4e3..4a2b9f061 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -15,8 +15,7 @@ package linux import ( - "syscall" - + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" @@ -29,11 +28,11 @@ import ( // EpollCreate1 implements the epoll_create1(2) linux syscall. func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() - if flags & ^syscall.EPOLL_CLOEXEC != 0 { + if flags & ^linux.EPOLL_CLOEXEC != 0 { return 0, nil, syserror.EINVAL } - closeOnExec := flags&syscall.EPOLL_CLOEXEC != 0 + closeOnExec := flags&linux.EPOLL_CLOEXEC != 0 fd, err := syscalls.CreateEpoll(t, closeOnExec) if err != nil { return 0, nil, err @@ -69,37 +68,34 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := epoll.EntryFlags(0) mask := waiter.EventMask(0) var data [2]int32 - if op != syscall.EPOLL_CTL_DEL { - var e syscall.EpollEvent + if op != linux.EPOLL_CTL_DEL { + var e linux.EpollEvent if _, err := t.CopyIn(eventAddr, &e); err != nil { return 0, nil, err } - if e.Events&syscall.EPOLLONESHOT != 0 { + if e.Events&linux.EPOLLONESHOT != 0 { flags |= epoll.OneShot } - // syscall.EPOLLET is incorrectly generated as a negative number - // in Go, see https://github.com/golang/go/issues/5328 for - // details. - if e.Events&-syscall.EPOLLET != 0 { + if e.Events&linux.EPOLLET != 0 { flags |= epoll.EdgeTriggered } mask = waiter.EventMaskFromLinux(e.Events) data[0] = e.Fd - data[1] = e.Pad + data[1] = e.Data } // Perform the requested operations. switch op { - case syscall.EPOLL_CTL_ADD: + case linux.EPOLL_CTL_ADD: // See fs/eventpoll.c. mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data) - case syscall.EPOLL_CTL_DEL: + case linux.EPOLL_CTL_DEL: return 0, nil, syscalls.RemoveEpoll(t, epfd, fd) - case syscall.EPOLL_CTL_MOD: + case linux.EPOLL_CTL_MOD: // Same as EPOLL_CTL_ADD. mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data) diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 5cfc2c3c8..8a34c4e99 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -15,12 +15,11 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" + "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -38,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC) if flags & ^allOps != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 40722abc2..2e00a91ce 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -768,7 +768,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall defer file.DecRef() err := file.Flush(t) - return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file) + return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) } // Dup implements linux syscall dup(2). @@ -910,7 +910,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Copy in the lock request. flockAddr := args[2].Pointer() - var flock syscall.Flock_t + var flock linux.Flock if _, err := t.CopyIn(flockAddr, &flock); err != nil { return 0, nil, err } @@ -959,11 +959,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. switch flock.Type { - case syscall.F_RDLCK: + case linux.F_RDLCK: if !file.Flags().Read { return 0, nil, syserror.EBADF } - if cmd == syscall.F_SETLK { + if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN @@ -975,11 +975,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } return 0, nil, nil - case syscall.F_WRLCK: + case linux.F_WRLCK: if !file.Flags().Write { return 0, nil, syserror.EBADF } - if cmd == syscall.F_SETLK { + if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN @@ -991,7 +991,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } return 0, nil, nil - case syscall.F_UNLCK: + case linux.F_UNLCK: file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) return 0, nil, nil default: @@ -1476,7 +1476,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG @@ -1536,7 +1536,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG @@ -1826,7 +1826,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // No timesAddr argument will be interpreted as current system time. ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { - var times syscall.Utimbuf + var times linux.Utime if _, err := t.CopyIn(timesAddr, ×); err != nil { return 0, nil, err } @@ -2018,7 +2018,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), + Signo: int32(linux.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 7a2beda23..63e2c5a5d 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -17,8 +17,8 @@ package linux import ( "bytes" "io" - "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -82,7 +82,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err { case nil: - dir.Dirent.InotifyEvent(syscall.IN_ACCESS, 0) + dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0) return uintptr(ds.Written()), nil case io.EOF: return 0, nil @@ -146,21 +146,21 @@ func smallestDirent64(a arch.Context) uint { func toType(nodeType fs.InodeType) uint8 { switch nodeType { case fs.RegularFile, fs.SpecialFile: - return syscall.DT_REG + return linux.DT_REG case fs.Symlink: - return syscall.DT_LNK + return linux.DT_LNK case fs.Directory, fs.SpecialDirectory: - return syscall.DT_DIR + return linux.DT_DIR case fs.Pipe: - return syscall.DT_FIFO + return linux.DT_FIFO case fs.CharacterDevice: - return syscall.DT_CHR + return linux.DT_CHR case fs.BlockDevice: - return syscall.DT_BLK + return linux.DT_BLK case fs.Socket: - return syscall.DT_SOCK + return linux.DT_SOCK default: - return syscall.DT_UNKNOWN + return linux.DT_UNKNOWN } } diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index a4f36e01f..b2c7b3444 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -15,13 +15,12 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) @@ -31,7 +30,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. flags := int(args[0].Int()) if flags&^allFlags != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } dirent := fs.NewDirent(t, anon.NewInode(t), "inotify") @@ -66,14 +65,14 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { file := t.GetFile(fd) if file == nil { // Invalid fd. - return nil, nil, syscall.EBADF + return nil, nil, syserror.EBADF } ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. file.DecRef() - return nil, nil, syscall.EINVAL + return nil, nil, syserror.EINVAL } return ino, file, nil @@ -92,7 +91,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } ino, file, err := fdToInotify(t, fd) @@ -109,7 +108,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error { // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { - return syscall.ENOTDIR + return syserror.ENOTDIR } // Copy out to the return frame. diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 576022dd5..418d7fa5f 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -15,20 +15,19 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) // pipe2 implements the actual system call with flags. func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) @@ -49,7 +48,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { // The files are not closed in this case, the exact semantics // of this error case are not well defined, but they could have // already been observed by user space. - return 0, syscall.EFAULT + return 0, syserror.EFAULT } return 0, nil } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 5329023e6..98db32d77 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -16,7 +16,6 @@ package linux import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" ) // Prctl implements linux syscall prctl(2). @@ -36,7 +36,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_PDEATHSIG: sig := linux.Signal(args[1].Int()) if sig != 0 && !sig.IsValid() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } t.SetParentDeathSignal(sig) return 0, nil, nil @@ -67,7 +67,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall d = mm.UserDumpable default: // N.B. Userspace may not pass SUID_DUMP_ROOT. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } t.MemoryManager().SetDumpability(d) return 0, nil, nil @@ -88,7 +88,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else if val == 1 { t.SetKeepCaps(true) } else { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil @@ -96,7 +96,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_NAME: addr := args[1].Pointer() name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) - if err != nil && err != syscall.ENAMETOOLONG { + if err != nil && err != syserror.ENAMETOOLONG { return 0, nil, err } t.SetName(name) @@ -116,7 +116,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_MM: if !t.HasCapability(linux.CAP_SYS_RESOURCE) { - return 0, nil, syscall.EPERM + return 0, nil, syserror.EPERM } switch args[1].Int() { @@ -125,13 +125,13 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } // Set the underlying executable. @@ -153,12 +153,12 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } case linux.PR_SET_NO_NEW_PRIVS: if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // no_new_privs is assumed to always be set. See // kernel.Task.updateCredsForExec. @@ -166,14 +166,14 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_GET_NO_NEW_PRIVS: if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 1, nil, nil case linux.PR_SET_SECCOMP: if args[1].Int() != linux.SECCOMP_MODE_FILTER { // Unsupported mode. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) @@ -184,7 +184,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_READ: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } var rv uintptr if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { @@ -195,7 +195,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_DROP: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, t.DropBoundingCapability(cp) @@ -220,7 +220,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index 434bbb322..99f6993f5 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -15,11 +15,10 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -37,13 +36,13 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel pid := args[0].Int() param := args[1].Pointer() if param == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } r := SchedParam{schedPriority: onlyPriority} if _, err := t.CopyOut(param, r); err != nil { @@ -57,10 +56,10 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } return onlyScheduler, nil, nil } @@ -71,20 +70,20 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke policy := args[1].Int() param := args[2].Pointer() if pid < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if policy != onlyScheduler { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } var r SchedParam if _, err := t.CopyIn(param, &r); err != nil { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if r.schedPriority != onlyPriority { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 4885b5e40..18510ead8 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -15,13 +15,12 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. @@ -43,7 +42,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { // We only support SECCOMP_SET_MODE_FILTER at the moment. if mode != linux.SECCOMP_SET_MODE_FILTER { // Unsupported mode. - return syscall.EINVAL + return syserror.EINVAL } tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 @@ -51,7 +50,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { // Unsupported flag. - return syscall.EINVAL + return syserror.EINVAL } var fprog userSockFprog @@ -65,7 +64,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { compiledFilter, err := bpf.Compile(filter) if err != nil { t.Debugf("Invalid seccomp-bpf filter: %v", err) - return syscall.EINVAL + return syserror.EINVAL } return t.AppendSyscallFilter(compiledFilter, tsync) diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index d1165a57a..195734257 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -15,7 +15,6 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -131,7 +130,7 @@ func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { - return nil, syscall.EINVAL + return nil, syserror.EINVAL } addrBuf := make([]byte, addrlen) @@ -153,7 +152,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } if int32(bufLen) < 0 { - return syscall.EINVAL + return syserror.EINVAL } // Write the length unconditionally. @@ -186,7 +185,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Create the new socket. @@ -218,7 +217,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } fileFlags := fs.SettableFileFlags{ @@ -261,14 +260,14 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. @@ -286,20 +285,20 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } // Call the syscall implementation for this socket, then copy the @@ -314,7 +313,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if peerRequested { // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. - if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL { + if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { return 0, err } } @@ -351,14 +350,14 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. @@ -378,14 +377,14 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Per Linux, the backlog is silently capped to reasonable values. @@ -407,21 +406,21 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Validate how, then call syscall implementation. switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() @@ -438,14 +437,14 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Read the length if present. Reject negative values. @@ -456,7 +455,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } if optLen < 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } } @@ -522,21 +521,21 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } if optLen <= 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if optLen > maxOptLen { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyIn(optValAddr, &buf); err != nil { @@ -560,14 +559,14 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Get the socket name and copy it to the caller. @@ -588,14 +587,14 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Get the socket peer name and copy it to the caller. @@ -615,25 +614,25 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -663,25 +662,25 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } if file.Flags().NonBlocking { @@ -696,7 +695,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if !ts.Valid() { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true @@ -716,7 +715,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { @@ -726,7 +725,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } if _, err = t.CopyOut(lp, uint32(n)); err != nil { break @@ -748,7 +747,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if msg.IovLen > linux.UIO_MAXIOV { - return 0, syscall.EMSGSIZE + return 0, syserror.EMSGSIZE } dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -759,7 +758,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // FIXME(b/63594852): Pretend we have an empty error queue. if flags&linux.MSG_ERRQUEUE != 0 { - return 0, syscall.EAGAIN + return 0, syserror.EAGAIN } // Fast path when no control message nor name buffers are provided. @@ -784,7 +783,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if msg.ControlLen > maxControlLen { - return 0, syscall.ENOBUFS + return 0, syserror.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { @@ -836,25 +835,25 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { if int(bufLen) < 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } if file.Flags().NonBlocking { @@ -914,25 +913,25 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -952,25 +951,25 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syscall.EBADF + return 0, nil, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syscall.ENOTSOCK + return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if file.Flags().NonBlocking { @@ -982,7 +981,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { @@ -992,7 +991,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syscall.EFAULT + return 0, nil, syserror.EFAULT } if _, err = t.CopyOut(lp, uint32(n)); err != nil { break @@ -1017,7 +1016,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { - return 0, syscall.ENOBUFS + return 0, syserror.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { @@ -1037,7 +1036,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { - return 0, syscall.EMSGSIZE + return 0, syserror.EMSGSIZE } src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -1074,20 +1073,20 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syscall.EBADF + return 0, syserror.EBADF } defer file.DecRef() // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syscall.ENOTSOCK + return 0, syserror.ENOTSOCK } if file.Flags().NonBlocking { diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 9e037bd7b..595eb9155 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -123,29 +123,29 @@ func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) { opts := kernel.CloneOptions{ SharingOptions: kernel.SharingOptions{ - NewAddressSpace: flags&syscall.CLONE_VM == 0, - NewSignalHandlers: flags&syscall.CLONE_SIGHAND == 0, - NewThreadGroup: flags&syscall.CLONE_THREAD == 0, + NewAddressSpace: flags&linux.CLONE_VM == 0, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, + NewThreadGroup: flags&linux.CLONE_THREAD == 0, TerminationSignal: linux.Signal(flags & exitSignalMask), - NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, - NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, - NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, - NewFiles: flags&syscall.CLONE_FILES == 0, - NewFSContext: flags&syscall.CLONE_FS == 0, - NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, - NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == 0, + NewFSContext: flags&linux.CLONE_FS == 0, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, }, Stack: stack, - SetTLS: flags&syscall.CLONE_SETTLS == syscall.CLONE_SETTLS, + SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, TLS: tls, - ChildClearTID: flags&syscall.CLONE_CHILD_CLEARTID == syscall.CLONE_CHILD_CLEARTID, - ChildSetTID: flags&syscall.CLONE_CHILD_SETTID == syscall.CLONE_CHILD_SETTID, + ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, + ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, ChildTID: childTID, - ParentSetTID: flags&syscall.CLONE_PARENT_SETTID == syscall.CLONE_PARENT_SETTID, + ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, ParentTID: parentTID, - Vfork: flags&syscall.CLONE_VFORK == syscall.CLONE_VFORK, - Untraced: flags&syscall.CLONE_UNTRACED == syscall.CLONE_UNTRACED, - InheritTracer: flags&syscall.CLONE_PTRACE == syscall.CLONE_PTRACE, + Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, + Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, + InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, } ntid, ctrl, err := t.Clone(&opts) return uintptr(ntid), ctrl, err @@ -168,7 +168,7 @@ func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // "A call to fork() is equivalent to a call to clone(2) specifying flags // as just SIGCHLD." - fork(2) - return clone(t, int(syscall.SIGCHLD), 0, 0, 0, 0) + return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0) } // Vfork implements Linux syscall vfork(2). @@ -178,7 +178,7 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // CLONE_VM | CLONE_VFORK | SIGCHLD // """ - vfork(2) - return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0) + return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0) } // parseCommonWaitOptions applies the options common to wait4 and waitid to @@ -193,7 +193,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { wopts.NonCloneTasks = true wopts.CloneTasks = true default: - return syscall.EINVAL + return syserror.EINVAL } if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue @@ -210,7 +210,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventExit | kernel.EventTraceeStop, @@ -291,10 +291,10 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal rusageAddr := args[4].Pointer() if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventTraceeStop, @@ -307,7 +307,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } if err := parseCommonWaitOptions(&wopts, options); err != nil { @@ -347,12 +347,12 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil } si := arch.SignalInfo{ - Signo: int32(syscall.SIGCHLD), + Signo: int32(linux.SIGCHLD), } si.SetPid(int32(wr.TID)) si.SetUid(int32(wr.UID)) // TODO(b/73541790): convert kernel.ExitStatus to functions and make - // WaitResult.Status a linux.WaitStatus + // WaitResult.Status a linux.WaitStatus. s := syscall.WaitStatus(wr.Status) switch { case s.Exited(): @@ -374,7 +374,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } case s.Continued(): si.Code = arch.CLD_CONTINUED - si.SetStatus(int32(syscall.SIGCONT)) + si.SetStatus(int32(linux.SIGCONT)) default: t.Warningf("waitid got incomprehensible wait status %d", s) } @@ -395,16 +395,16 @@ func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() opts := kernel.SharingOptions{ - NewAddressSpace: flags&syscall.CLONE_VM == syscall.CLONE_VM, - NewSignalHandlers: flags&syscall.CLONE_SIGHAND == syscall.CLONE_SIGHAND, - NewThreadGroup: flags&syscall.CLONE_THREAD == syscall.CLONE_THREAD, - NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, - NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, - NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, - NewFiles: flags&syscall.CLONE_FILES == syscall.CLONE_FILES, - NewFSContext: flags&syscall.CLONE_FS == syscall.CLONE_FS, - NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, - NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, + NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, + NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, + NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, + NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, + NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, + NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, + NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, + NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, + NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, } // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) if opts.NewPIDNamespace { @@ -623,7 +623,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S who := kernel.ThreadID(args[1].Int()) switch which { - case syscall.PRIO_PROCESS: + case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { @@ -633,7 +633,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } // From kernel/sys.c:getpriority: @@ -641,13 +641,13 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // will not return the normal nice-value, but a negated // value that has been offset by 20" return uintptr(20 - task.Niceness()), nil, nil - case syscall.PRIO_USER: + case linux.PRIO_USER: fallthrough - case syscall.PRIO_PGRP: + case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } } @@ -669,7 +669,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } switch which { - case syscall.PRIO_PROCESS: + case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { @@ -679,17 +679,17 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syscall.ESRCH + return 0, nil, syserror.ESRCH } task.SetNiceness(niceval) - case syscall.PRIO_USER: + case linux.PRIO_USER: fallthrough - case syscall.PRIO_PGRP: + case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index ca5ccb7c3..d4134207b 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -15,13 +15,13 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" ) const nsecPerSec = int64(time.Second) @@ -47,7 +47,7 @@ func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) return itv, nil default: - return linux.ItimerVal{}, syscall.ENOSYS + return linux.ItimerVal{}, syserror.ENOSYS } } @@ -65,7 +65,7 @@ func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) e _, err := t.CopyOut(addr, itv) return err default: - return syscall.ENOSYS + return syserror.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index e3d1c6201..b3eb96a1c 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -17,11 +17,10 @@ package linux import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" ) // ArchPrctl implements linux syscall arch_prctl(2). @@ -39,14 +38,14 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { - return 0, nil, syscall.EPERM + return 0, nil, syserror.EPERM } case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syscall.EINVAL + return 0, nil, syserror.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 9ba0eba7a..4ff8f9234 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -15,7 +15,6 @@ package linux import ( - "syscall" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -70,7 +69,7 @@ func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) { tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:])) return tv, nil default: - return linux.Timeval{}, syscall.ENOSYS + return linux.Timeval{}, syserror.ENOSYS } } @@ -84,7 +83,7 @@ func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error _, err := t.CopyOutBytes(addr, out) return err default: - return syscall.ENOSYS + return syserror.ENOSYS } } @@ -104,7 +103,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.D return 0, err } if !timespec.Valid() { - return 0, syscall.EINVAL + return 0, syserror.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 94d52d5d5..f88055676 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -26,7 +26,6 @@ package syscalls import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -56,7 +55,7 @@ func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []st } // Error returns a syscall handler that will always give the passed error. -func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { +func Error(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } @@ -73,7 +72,7 @@ func Error(name string, err syscall.Errno, note string, urls []string) kernel.Sy // ErrorWithEvent gives a syscall function that sends an unimplemented // syscall event via the event channel and returns the passed error. -func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { +func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 345653544..1987e89cc 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -48,6 +48,7 @@ var ( EMSGSIZE = error(syscall.EMSGSIZE) ENAMETOOLONG = error(syscall.ENAMETOOLONG) ENOATTR = ENODATA + ENOBUFS = error(syscall.ENOBUFS) ENODATA = error(syscall.ENODATA) ENODEV = error(syscall.ENODEV) ENOENT = error(syscall.ENOENT) @@ -59,6 +60,7 @@ var ( ENOSYS = error(syscall.ENOSYS) ENOTDIR = error(syscall.ENOTDIR) ENOTEMPTY = error(syscall.ENOTEMPTY) + ENOTSOCK = error(syscall.ENOTSOCK) ENOTSUP = error(syscall.ENOTSUP) ENOTTY = error(syscall.ENOTTY) ENXIO = error(syscall.ENXIO) -- cgit v1.2.3 From dea3cb92f2c9fffb604cedde6998b3209c91e716 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 9 Jul 2019 16:42:54 -0700 Subject: build: add nogo for static validation PiperOrigin-RevId: 257297820 --- BUILD | 14 ++++++++- WORKSPACE | 11 ++++++- pkg/sentry/kernel/BUILD | 4 +-- pkg/sentry/kernel/auth/BUILD | 4 +-- pkg/sentry/kernel/futex/BUILD | 4 +-- pkg/sentry/time/BUILD | 4 +-- third_party/gvsync/atomicptrtest/BUILD | 4 +-- third_party/gvsync/seqatomictest/BUILD | 4 +-- tools/checkunsafe/BUILD | 13 ++++++++ tools/checkunsafe/check_unsafe.go | 56 ++++++++++++++++++++++++++++++++++ tools/nogo.js | 7 +++++ 11 files changed, 111 insertions(+), 14 deletions(-) create mode 100644 tools/checkunsafe/BUILD create mode 100644 tools/checkunsafe/check_unsafe.go create mode 100644 tools/nogo.js diff --git a/BUILD b/BUILD index 6d5e800ca..60ed992c4 100644 --- a/BUILD +++ b/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_path") +load("@io_bazel_rules_go//go:def.bzl", "go_path", "nogo") load("@bazel_gazelle//:def.bzl", "gazelle") # The sandbox filegroup is used for sandbox-internal dependencies. @@ -29,3 +29,15 @@ go_path( # To update the WORKSPACE from go.mod, use: # bazel run //:gazelle -- update-repos -from_file=go.mod gazelle(name = "gazelle") + +# nogo applies checks to all Go source in this repository, enforcing code +# guidelines and restrictions. Note that the tool libraries themselves should +# live in the tools subdirectory (unless they are standard). +nogo( + name = "nogo", + config = "tools/nogo.js", + visibility = ["//visibility:public"], + deps = [ + "//tools/checkunsafe", + ], +) diff --git a/WORKSPACE b/WORKSPACE index b76442d9e..26243b7f2 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -20,7 +20,10 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_to go_rules_dependencies() -go_register_toolchains(go_version = "1.12.6") +go_register_toolchains( + go_version = "1.12.6", + nogo = "@//:nogo", +) load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository") @@ -141,6 +144,12 @@ go_repository( importpath = "golang.org/x/sys", ) +go_repository( + name = "org_golang_x_tools", + commit = "aa82965741a9fecd12b026fbb3d3c6ed3231b8f8", + importpath = "golang.org/x/tools", +) + go_repository( name = "com_github_google_btree", importpath = "github.com/google/btree", diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 7b92f1b8d..e61d39c82 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -31,7 +31,7 @@ go_template_instance( go_template_instance( name = "seqatomic_taskgoroutineschedinfo", - out = "seqatomic_taskgoroutineschedinfo.go", + out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", template = "//third_party/gvsync:generic_seqatomic", @@ -112,7 +112,7 @@ go_library( "ptrace_arm64.go", "rseq.go", "seccomp.go", - "seqatomic_taskgoroutineschedinfo.go", + "seqatomic_taskgoroutineschedinfo_unsafe.go", "session_list.go", "sessions.go", "signal.go", diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 42779baa9..1d00a6310 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -5,7 +5,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "atomicptr_credentials", - out = "atomicptr_credentials.go", + out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", template = "//third_party/gvsync:generic_atomicptr", @@ -45,7 +45,7 @@ go_template_instance( go_library( name = "auth", srcs = [ - "atomicptr_credentials.go", + "atomicptr_credentials_unsafe.go", "auth.go", "capability_set.go", "context.go", diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index a5cf1f627..6a31dc044 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -5,7 +5,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "atomicptr_bucket", - out = "atomicptr_bucket.go", + out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", template = "//third_party/gvsync:generic_atomicptr", @@ -29,7 +29,7 @@ go_template_instance( go_library( name = "futex", srcs = [ - "atomicptr_bucket.go", + "atomicptr_bucket_unsafe.go", "futex.go", "waiter_list.go", ], diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index d2ede0353..8aa6a3017 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -6,7 +6,7 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( name = "seqatomic_parameters", - out = "seqatomic_parameters.go", + out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", template = "//third_party/gvsync:generic_seqatomic", @@ -27,7 +27,7 @@ go_library( "parameters.go", "sampler.go", "sampler_unsafe.go", - "seqatomic_parameters.go", + "seqatomic_parameters_unsafe.go", "tsc_amd64.s", "tsc_arm64.s", ], diff --git a/third_party/gvsync/atomicptrtest/BUILD b/third_party/gvsync/atomicptrtest/BUILD index 631b0b64c..6cf69ea91 100644 --- a/third_party/gvsync/atomicptrtest/BUILD +++ b/third_party/gvsync/atomicptrtest/BUILD @@ -6,7 +6,7 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( name = "atomicptr_int", - out = "atomicptr_int.go", + out = "atomicptr_int_unsafe.go", package = "atomicptr", suffix = "Int", template = "//third_party/gvsync:generic_atomicptr", @@ -17,7 +17,7 @@ go_template_instance( go_library( name = "atomicptr", - srcs = ["atomicptr_int.go"], + srcs = ["atomicptr_int_unsafe.go"], importpath = "gvisor.dev/gvisor/third_party/gvsync/atomicptr", ) diff --git a/third_party/gvsync/seqatomictest/BUILD b/third_party/gvsync/seqatomictest/BUILD index 9dd600148..9e87e0bc5 100644 --- a/third_party/gvsync/seqatomictest/BUILD +++ b/third_party/gvsync/seqatomictest/BUILD @@ -6,7 +6,7 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( name = "seqatomic_int", - out = "seqatomic_int.go", + out = "seqatomic_int_unsafe.go", package = "seqatomic", suffix = "Int", template = "//third_party/gvsync:generic_seqatomic", @@ -17,7 +17,7 @@ go_template_instance( go_library( name = "seqatomic", - srcs = ["seqatomic_int.go"], + srcs = ["seqatomic_int_unsafe.go"], importpath = "gvisor.dev/gvisor/third_party/gvsync/seqatomic", deps = [ "//third_party/gvsync", diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD new file mode 100644 index 000000000..d85c56131 --- /dev/null +++ b/tools/checkunsafe/BUILD @@ -0,0 +1,13 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_tool_library") + +package(licenses = ["notice"]) + +go_tool_library( + name = "checkunsafe", + srcs = ["check_unsafe.go"], + importpath = "checkunsafe", + visibility = ["//visibility:public"], + deps = [ + "@org_golang_x_tools//go/analysis:go_tool_library", + ], +) diff --git a/tools/checkunsafe/check_unsafe.go b/tools/checkunsafe/check_unsafe.go new file mode 100644 index 000000000..4ccd7cc5a --- /dev/null +++ b/tools/checkunsafe/check_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package checkunsafe allows unsafe imports only in files named appropriately. +package checkunsafe + +import ( + "fmt" + "path" + "strconv" + "strings" + + "golang.org/x/tools/go/analysis" +) + +// Analyzer defines the entrypoint. +var Analyzer = &analysis.Analyzer{ + Name: "checkunsafe", + Doc: "allows unsafe use only in specified files", + Run: run, +} + +func run(pass *analysis.Pass) (interface{}, error) { + for _, f := range pass.Files { + for _, imp := range f.Imports { + // Is this an unsafe import? + pkg, err := strconv.Unquote(imp.Path.Value) + if err != nil || pkg != "unsafe" { + continue + } + + // Extract the filename. + filename := pass.Fset.File(imp.Pos()).Name() + + // Allow files named _unsafe.go or _test.go to opt out. + if strings.HasSuffix(filename, "_unsafe.go") || strings.HasSuffix(filename, "_test.go") { + continue + } + + // Throw the error. + pass.Reportf(imp.Pos(), fmt.Sprintf("package unsafe imported by %s; must end with _unsafe.go", path.Base(filename))) + } + } + return nil, nil +} diff --git a/tools/nogo.js b/tools/nogo.js new file mode 100644 index 000000000..fc0a4d1f0 --- /dev/null +++ b/tools/nogo.js @@ -0,0 +1,7 @@ +{ + "checkunsafe": { + "exclude_files": { + "/external/": "not subject to constraint" + } + } +} -- cgit v1.2.3 From 7965b1272bb0579da47960e64ea902c74f49483d Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Tue, 9 Jul 2019 18:34:58 -0700 Subject: ext4: disklayout: Directory Entry implementation. PiperOrigin-RevId: 257314911 --- pkg/sentry/fs/ext4/disklayout/BUILD | 5 ++ pkg/sentry/fs/ext4/disklayout/dirent.go | 69 ++++++++++++++++++++++++++++ pkg/sentry/fs/ext4/disklayout/dirent_new.go | 61 ++++++++++++++++++++++++ pkg/sentry/fs/ext4/disklayout/dirent_old.go | 50 ++++++++++++++++++++ pkg/sentry/fs/ext4/disklayout/dirent_test.go | 28 +++++++++++ 5 files changed, 213 insertions(+) create mode 100644 pkg/sentry/fs/ext4/disklayout/dirent.go create mode 100644 pkg/sentry/fs/ext4/disklayout/dirent_new.go create mode 100644 pkg/sentry/fs/ext4/disklayout/dirent_old.go create mode 100644 pkg/sentry/fs/ext4/disklayout/dirent_test.go diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD index d1cbab275..2920bfb52 100644 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ b/pkg/sentry/fs/ext4/disklayout/BUILD @@ -8,6 +8,9 @@ go_library( "block_group.go", "block_group_32.go", "block_group_64.go", + "dirent.go", + "dirent_new.go", + "dirent_old.go", "disklayout.go", "inode.go", "inode_new.go", @@ -22,6 +25,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", ], @@ -32,6 +36,7 @@ go_test( size = "small", srcs = [ "block_group_test.go", + "dirent_test.go", "inode_test.go", "superblock_test.go", ], diff --git a/pkg/sentry/fs/ext4/disklayout/dirent.go b/pkg/sentry/fs/ext4/disklayout/dirent.go new file mode 100644 index 000000000..685bf57b8 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/dirent.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +const ( + // MaxFileName is the maximum length of an ext fs file's name. + MaxFileName = 255 +) + +var ( + // inodeTypeByFileType maps ext4 file types to vfs inode types. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. + inodeTypeByFileType = map[uint8]fs.InodeType{ + 0: fs.Anonymous, + 1: fs.RegularFile, + 2: fs.Directory, + 3: fs.CharacterDevice, + 4: fs.BlockDevice, + 5: fs.Pipe, + 6: fs.Socket, + 7: fs.Symlink, + } +) + +// The Dirent interface should be implemented by structs representing ext +// directory entries. These are for the linear classical directories which +// just store a list of dirent structs. A directory is a series of data blocks +// where is each data block contains a linear array of dirents. The last entry +// of the block has a record size that takes it to the end of the block. The +// end of the directory is when you read dirInode.Size() bytes from the blocks. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. +type Dirent interface { + // Inode returns the absolute inode number of the underlying inode. + // Inode number 0 signifies an unused dirent. + Inode() uint32 + + // RecordSize returns the record length of this dirent on disk. The next + // dirent in the dirent list should be read after these many bytes from + // the current dirent. Must be a multiple of 4. + RecordSize() uint16 + + // FileName returns the name of the file. Can be at most 255 is length. + FileName() string + + // FileType returns the inode type of the underlying inode. This is a + // performance hack so that we do not have to read the underlying inode struct + // to know the type of inode. This will only work when the SbDirentFileType + // feature is set. If not, the second returned value will be false indicating + // that user code has to use the inode mode to extract the file type. + FileType() (fs.InodeType, bool) +} diff --git a/pkg/sentry/fs/ext4/disklayout/dirent_new.go b/pkg/sentry/fs/ext4/disklayout/dirent_new.go new file mode 100644 index 000000000..29ae4a5c2 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/dirent_new.go @@ -0,0 +1,61 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// DirentNew represents the ext4 directory entry struct. This emulates Linux's +// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we +// only need 8 bits to store the NameLength. As a result, NameLength has been +// shortened and the other 8 bits are used to encode the file type. Use the +// FileTypeRaw field only if the SbDirentFileType feature is set. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentNew struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint8 + FileTypeRaw uint8 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentNew implements Dirent. +var _ Dirent = (*DirentNew)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentNew) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentNew) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentNew) FileType() (fs.InodeType, bool) { + if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { + return inodeType, true + } + + panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) +} diff --git a/pkg/sentry/fs/ext4/disklayout/dirent_old.go b/pkg/sentry/fs/ext4/disklayout/dirent_old.go new file mode 100644 index 000000000..2e0f9c812 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/dirent_old.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/fs" + +// DirentOld represents the old directory entry struct which does not contain +// the file type. This emulates Linux's ext4_dir_entry struct. This is used in +// ext2, ext3 and sometimes in ext4. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentOld struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint16 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentOld implements Dirent. +var _ Dirent = (*DirentOld)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentOld) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentOld) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentOld) FileType() (fs.InodeType, bool) { + return fs.Anonymous, false +} diff --git a/pkg/sentry/fs/ext4/disklayout/dirent_test.go b/pkg/sentry/fs/ext4/disklayout/dirent_test.go new file mode 100644 index 000000000..cc6dff2c9 --- /dev/null +++ b/pkg/sentry/fs/ext4/disklayout/dirent_test.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestDirentSize tests that the dirent structs are of the correct +// size. +func TestDirentSize(t *testing.T) { + want := uintptr(263) + + assertSize(t, DirentOld{}, want) + assertSize(t, DirentNew{}, want) +} -- cgit v1.2.3 From 7581e84cb6b709019c51a1e7d86414f696470554 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Tue, 9 Jul 2019 22:20:45 -0700 Subject: tss: block userspace access to all I/O ports. A userspace process (CPL=3) can access an i/o port if the bit corresponding to the port is set to 0 in the I/O permission bitmap. Configure the I/O permission bitmap address beyond the last valid byte in the TSS so access to all i/o ports is blocked. Signed-off-by: Liu Hua Change-Id: I3df76980c3735491db768f7210e71703f86bb989 PiperOrigin-RevId: 257336518 --- pkg/sentry/platform/ring0/kernel_amd64.go | 8 ++++++++ test/syscalls/linux/exceptions.cc | 34 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 3577b5127..0feff8778 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -70,6 +70,14 @@ func (c *CPU) init() { c.tss.ist1Lo = uint32(stackAddr) c.tss.ist1Hi = uint32(stackAddr >> 32) + // Set the I/O bitmap base address beyond the last byte in the TSS + // to block access to the entire I/O address range. + // + // From section 18.5.2 "I/O Permission Bit Map" from Intel SDM vol1: + // I/O addresses not spanned by the map are treated as if they had set + // bits in the map. + c.tss.ioPerm = tssLimit + 1 + // Permanently set the kernel segments. c.registers.Cs = uint64(Kcode) c.registers.Ds = uint64(Kdata) diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc index 0da4c817d..370e85166 100644 --- a/test/syscalls/linux/exceptions.cc +++ b/test/syscalls/linux/exceptions.cc @@ -56,6 +56,26 @@ void inline Int3Normal() { asm(".byte 0xcd, 0x03\r\n"); } void inline Int3Compact() { asm(".byte 0xcc\r\n"); } +void InIOHelper(int width, int value) { + EXPECT_EXIT( + { + switch (width) { + case 1: + asm volatile("inb %%dx, %%al" ::"d"(value) : "%eax"); + break; + case 2: + asm volatile("inw %%dx, %%ax" ::"d"(value) : "%eax"); + break; + case 4: + asm volatile("inl %%dx, %%eax" ::"d"(value) : "%eax"); + break; + default: + FAIL() << "invalid input width, only 1, 2 or 4 is allowed"; + } + }, + ::testing::KilledBySignal(SIGSEGV), ""); +} + TEST(ExceptionTest, Halt) { // In order to prevent the regular handler from messing with things (and // perhaps refaulting until some other signal occurs), we reset the handler to @@ -87,6 +107,20 @@ TEST(ExceptionTest, DivideByZero) { ::testing::KilledBySignal(SIGFPE), ""); } +TEST(ExceptionTest, IOAccessFault) { + // See above. + struct sigaction sa = {}; + sa.sa_handler = SIG_DFL; + auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa)); + + InIOHelper(1, 0x0); + InIOHelper(2, 0x7); + InIOHelper(4, 0x6); + InIOHelper(1, 0xffff); + InIOHelper(2, 0xffff); + InIOHelper(4, 0xfffd); +} + TEST(ExceptionTest, Alignment) { SetAlignmentCheck(); ClearAlignmentCheck(); -- cgit v1.2.3 From a018b229b57dff92ca0d99079180aa556dacbabc Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 11 Jul 2019 12:52:12 -0700 Subject: kokoro: use bazel 2.27.1 The latest version 2.28.0 doesn't work: ./runsc/linux_amd64_pure_stripped/runsc: operation not permitted, want 0 PiperOrigin-RevId: 257663312 --- tools/run_tests.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/run_tests.sh b/tools/run_tests.sh index d1669b343..483b9cb50 100755 --- a/tools/run_tests.sh +++ b/tools/run_tests.sh @@ -45,7 +45,8 @@ readonly TEST_PACKAGES=("//pkg/..." "//runsc/..." "//tools/...") ####################### # Install the latest version of Bazel and log the version. -(which use_bazel.sh && use_bazel.sh latest) || which bazel +# FIXME(b/137285694): Unable to build runsc with bazel 0.28.0. +(which use_bazel.sh && use_bazel.sh 0.27.1) || which bazel bazel version # Load the kvm module. -- cgit v1.2.3 From 5242face2eea968f5e3f367f7edfebdd3ae60430 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Thu, 11 Jul 2019 15:04:23 -0700 Subject: ext: boilerplate code. Renamed ext4 to ext since we are targeting ext(2/3/4). Removed fs.go since we are targeting VFS2. Added ext.go with filesystem struct. PiperOrigin-RevId: 257689775 --- pkg/sentry/fs/ext/BUILD | 11 + pkg/sentry/fs/ext/disklayout/BUILD | 46 +++ pkg/sentry/fs/ext/disklayout/block_group.go | 127 ++++++ pkg/sentry/fs/ext/disklayout/block_group_32.go | 72 ++++ pkg/sentry/fs/ext/disklayout/block_group_64.go | 93 +++++ pkg/sentry/fs/ext/disklayout/block_group_test.go | 26 ++ pkg/sentry/fs/ext/disklayout/dirent.go | 69 ++++ pkg/sentry/fs/ext/disklayout/dirent_new.go | 61 +++ pkg/sentry/fs/ext/disklayout/dirent_old.go | 50 +++ pkg/sentry/fs/ext/disklayout/dirent_test.go | 28 ++ pkg/sentry/fs/ext/disklayout/disklayout.go | 50 +++ pkg/sentry/fs/ext/disklayout/inode.go | 267 ++++++++++++ pkg/sentry/fs/ext/disklayout/inode_new.go | 96 +++++ pkg/sentry/fs/ext/disklayout/inode_old.go | 117 ++++++ pkg/sentry/fs/ext/disklayout/inode_test.go | 222 ++++++++++ pkg/sentry/fs/ext/disklayout/superblock.go | 468 ++++++++++++++++++++++ pkg/sentry/fs/ext/disklayout/superblock_32.go | 75 ++++ pkg/sentry/fs/ext/disklayout/superblock_64.go | 94 +++++ pkg/sentry/fs/ext/disklayout/superblock_old.go | 102 +++++ pkg/sentry/fs/ext/disklayout/superblock_test.go | 27 ++ pkg/sentry/fs/ext/disklayout/test_utils.go | 30 ++ pkg/sentry/fs/ext/ext.go | 49 +++ pkg/sentry/fs/ext4/BUILD | 14 - pkg/sentry/fs/ext4/disklayout/BUILD | 45 --- pkg/sentry/fs/ext4/disklayout/block_group.go | 127 ------ pkg/sentry/fs/ext4/disklayout/block_group_32.go | 72 ---- pkg/sentry/fs/ext4/disklayout/block_group_64.go | 93 ----- pkg/sentry/fs/ext4/disklayout/block_group_test.go | 26 -- pkg/sentry/fs/ext4/disklayout/dirent.go | 69 ---- pkg/sentry/fs/ext4/disklayout/dirent_new.go | 61 --- pkg/sentry/fs/ext4/disklayout/dirent_old.go | 50 --- pkg/sentry/fs/ext4/disklayout/dirent_test.go | 28 -- pkg/sentry/fs/ext4/disklayout/disklayout.go | 50 --- pkg/sentry/fs/ext4/disklayout/inode.go | 267 ------------ pkg/sentry/fs/ext4/disklayout/inode_new.go | 96 ----- pkg/sentry/fs/ext4/disklayout/inode_old.go | 117 ------ pkg/sentry/fs/ext4/disklayout/inode_test.go | 222 ---------- pkg/sentry/fs/ext4/disklayout/superblock.go | 468 ---------------------- pkg/sentry/fs/ext4/disklayout/superblock_32.go | 75 ---- pkg/sentry/fs/ext4/disklayout/superblock_64.go | 94 ----- pkg/sentry/fs/ext4/disklayout/superblock_old.go | 102 ----- pkg/sentry/fs/ext4/disklayout/superblock_test.go | 27 -- pkg/sentry/fs/ext4/disklayout/test_utils.go | 30 -- pkg/sentry/fs/ext4/fs.go | 61 --- 44 files changed, 2180 insertions(+), 2194 deletions(-) create mode 100644 pkg/sentry/fs/ext/BUILD create mode 100644 pkg/sentry/fs/ext/disklayout/BUILD create mode 100644 pkg/sentry/fs/ext/disklayout/block_group.go create mode 100644 pkg/sentry/fs/ext/disklayout/block_group_32.go create mode 100644 pkg/sentry/fs/ext/disklayout/block_group_64.go create mode 100644 pkg/sentry/fs/ext/disklayout/block_group_test.go create mode 100644 pkg/sentry/fs/ext/disklayout/dirent.go create mode 100644 pkg/sentry/fs/ext/disklayout/dirent_new.go create mode 100644 pkg/sentry/fs/ext/disklayout/dirent_old.go create mode 100644 pkg/sentry/fs/ext/disklayout/dirent_test.go create mode 100644 pkg/sentry/fs/ext/disklayout/disklayout.go create mode 100644 pkg/sentry/fs/ext/disklayout/inode.go create mode 100644 pkg/sentry/fs/ext/disklayout/inode_new.go create mode 100644 pkg/sentry/fs/ext/disklayout/inode_old.go create mode 100644 pkg/sentry/fs/ext/disklayout/inode_test.go create mode 100644 pkg/sentry/fs/ext/disklayout/superblock.go create mode 100644 pkg/sentry/fs/ext/disklayout/superblock_32.go create mode 100644 pkg/sentry/fs/ext/disklayout/superblock_64.go create mode 100644 pkg/sentry/fs/ext/disklayout/superblock_old.go create mode 100644 pkg/sentry/fs/ext/disklayout/superblock_test.go create mode 100644 pkg/sentry/fs/ext/disklayout/test_utils.go create mode 100644 pkg/sentry/fs/ext/ext.go delete mode 100644 pkg/sentry/fs/ext4/BUILD delete mode 100644 pkg/sentry/fs/ext4/disklayout/BUILD delete mode 100644 pkg/sentry/fs/ext4/disklayout/block_group.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/block_group_32.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/block_group_64.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/block_group_test.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/dirent.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/dirent_new.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/dirent_old.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/dirent_test.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/disklayout.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/inode.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/inode_new.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/inode_old.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/inode_test.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/superblock.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_32.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_64.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_old.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/superblock_test.go delete mode 100644 pkg/sentry/fs/ext4/disklayout/test_utils.go delete mode 100644 pkg/sentry/fs/ext4/fs.go diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD new file mode 100644 index 000000000..3c2a02eac --- /dev/null +++ b/pkg/sentry/fs/ext/BUILD @@ -0,0 +1,11 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "ext", + srcs = ["ext.go"], + importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext", + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sentry/fs/ext/disklayout"], +) diff --git a/pkg/sentry/fs/ext/disklayout/BUILD b/pkg/sentry/fs/ext/disklayout/BUILD new file mode 100644 index 000000000..e4cb26645 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/BUILD @@ -0,0 +1,46 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +go_library( + name = "disklayout", + srcs = [ + "block_group.go", + "block_group_32.go", + "block_group_64.go", + "dirent.go", + "dirent_new.go", + "dirent_old.go", + "disklayout.go", + "inode.go", + "inode_new.go", + "inode_old.go", + "superblock.go", + "superblock_32.go", + "superblock_64.go", + "superblock_old.go", + "test_utils.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + ], +) + +go_test( + name = "disklayout_test", + size = "small", + srcs = [ + "block_group_test.go", + "dirent_test.go", + "inode_test.go", + "superblock_test.go", + ], + embed = [":disklayout"], + deps = ["//pkg/sentry/kernel/time"], +) diff --git a/pkg/sentry/fs/ext/disklayout/block_group.go b/pkg/sentry/fs/ext/disklayout/block_group.go new file mode 100644 index 000000000..32ea3d97d --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/block_group.go @@ -0,0 +1,127 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// BlockGroup represents a Linux ext block group descriptor. An ext file system +// is split into a series of block groups. This provides an access layer to +// information needed to access and use a block group. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. +type BlockGroup interface { + // InodeTable returns the absolute block number of the block containing the + // inode table. This points to an array of Inode structs. Inode tables are + // statically allocated at mkfs time. The superblock records the number of + // inodes per group (length of this table) and the size of each inode struct. + InodeTable() uint64 + + // BlockBitmap returns the absolute block number of the block containing the + // block bitmap. This bitmap tracks the usage of data blocks within this block + // group and has its own checksum. + BlockBitmap() uint64 + + // InodeBitmap returns the absolute block number of the block containing the + // inode bitmap. This bitmap tracks the usage of this group's inode table + // entries and has its own checksum. + InodeBitmap() uint64 + + // ExclusionBitmap returns the absolute block number of the snapshot exclusion + // bitmap. + ExclusionBitmap() uint64 + + // FreeBlocksCount returns the number of free blocks in the group. + FreeBlocksCount() uint32 + + // FreeInodesCount returns the number of free inodes in the group. + FreeInodesCount() uint32 + + // DirectoryCount returns the number of inodes that represent directories + // under this block group. + DirectoryCount() uint32 + + // UnusedInodeCount returns the number of unused inodes beyond the last used + // inode in this group's inode table. As a result, we needn’t scan past the + // (InodesPerGroup - UnusedInodeCount())th entry in the inode table. + UnusedInodeCount() uint32 + + // BlockBitmapChecksum returns the block bitmap checksum. This is calculated + // using crc32c(FS UUID + group number + entire bitmap). + BlockBitmapChecksum() uint32 + + // InodeBitmapChecksum returns the inode bitmap checksum. This is calculated + // using crc32c(FS UUID + group number + entire bitmap). + InodeBitmapChecksum() uint32 + + // Checksum returns this block group's checksum. + // + // If SbMetadataCsum feature is set: + // - checksum is crc32c(FS UUID + group number + group descriptor + // structure) & 0xFFFF. + // + // If SbGdtCsum feature is set: + // - checksum is crc16(FS UUID + group number + group descriptor + // structure). + // + // SbMetadataCsum and SbGdtCsum should not be both set. + // If they are, Linux warns and asks to run fsck. + Checksum() uint16 + + // Flags returns BGFlags which represents the block group flags. + Flags() BGFlags +} + +// These are the different block group flags. +const ( + // BgInodeUninit indicates that inode table and bitmap are not initialized. + BgInodeUninit uint16 = 0x1 + + // BgBlockUninit indicates that block bitmap is not initialized. + BgBlockUninit uint16 = 0x2 + + // BgInodeZeroed indicates that inode table is zeroed. + BgInodeZeroed uint16 = 0x4 +) + +// BGFlags represents all the different combinations of block group flags. +type BGFlags struct { + InodeUninit bool + BlockUninit bool + InodeZeroed bool +} + +// ToInt converts a BGFlags struct back to its 16-bit representation. +func (f BGFlags) ToInt() uint16 { + var res uint16 + + if f.InodeUninit { + res |= BgInodeUninit + } + if f.BlockUninit { + res |= BgBlockUninit + } + if f.InodeZeroed { + res |= BgInodeZeroed + } + + return res +} + +// BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct. +func BGFlagsFromInt(flags uint16) BGFlags { + return BGFlags{ + InodeUninit: flags&BgInodeUninit > 0, + BlockUninit: flags&BgBlockUninit > 0, + InodeZeroed: flags&BgInodeZeroed > 0, + } +} diff --git a/pkg/sentry/fs/ext/disklayout/block_group_32.go b/pkg/sentry/fs/ext/disklayout/block_group_32.go new file mode 100644 index 000000000..3e16c76db --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/block_group_32.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// BlockGroup32Bit emulates the first half of struct ext4_group_desc in +// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and +// 32-bit ext4 filesystems. It implements BlockGroup interface. +type BlockGroup32Bit struct { + BlockBitmapLo uint32 + InodeBitmapLo uint32 + InodeTableLo uint32 + FreeBlocksCountLo uint16 + FreeInodesCountLo uint16 + UsedDirsCountLo uint16 + FlagsRaw uint16 + ExcludeBitmapLo uint32 + BlockBitmapChecksumLo uint16 + InodeBitmapChecksumLo uint16 + ItableUnusedLo uint16 + ChecksumRaw uint16 +} + +// Compiles only if BlockGroup32Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup32Bit)(nil) + +// InodeTable implements BlockGroup.InodeTable. +func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } + +// BlockBitmap implements BlockGroup.BlockBitmap. +func (bg *BlockGroup32Bit) BlockBitmap() uint64 { return uint64(bg.BlockBitmapLo) } + +// InodeBitmap implements BlockGroup.InodeBitmap. +func (bg *BlockGroup32Bit) InodeBitmap() uint64 { return uint64(bg.InodeBitmapLo) } + +// ExclusionBitmap implements BlockGroup.ExclusionBitmap. +func (bg *BlockGroup32Bit) ExclusionBitmap() uint64 { return uint64(bg.ExcludeBitmapLo) } + +// FreeBlocksCount implements BlockGroup.FreeBlocksCount. +func (bg *BlockGroup32Bit) FreeBlocksCount() uint32 { return uint32(bg.FreeBlocksCountLo) } + +// FreeInodesCount implements BlockGroup.FreeInodesCount. +func (bg *BlockGroup32Bit) FreeInodesCount() uint32 { return uint32(bg.FreeInodesCountLo) } + +// DirectoryCount implements BlockGroup.DirectoryCount. +func (bg *BlockGroup32Bit) DirectoryCount() uint32 { return uint32(bg.UsedDirsCountLo) } + +// UnusedInodeCount implements BlockGroup.UnusedInodeCount. +func (bg *BlockGroup32Bit) UnusedInodeCount() uint32 { return uint32(bg.ItableUnusedLo) } + +// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. +func (bg *BlockGroup32Bit) BlockBitmapChecksum() uint32 { return uint32(bg.BlockBitmapChecksumLo) } + +// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. +func (bg *BlockGroup32Bit) InodeBitmapChecksum() uint32 { return uint32(bg.InodeBitmapChecksumLo) } + +// Checksum implements BlockGroup.Checksum. +func (bg *BlockGroup32Bit) Checksum() uint16 { return bg.ChecksumRaw } + +// Flags implements BlockGroup.Flags. +func (bg *BlockGroup32Bit) Flags() BGFlags { return BGFlagsFromInt(bg.FlagsRaw) } diff --git a/pkg/sentry/fs/ext/disklayout/block_group_64.go b/pkg/sentry/fs/ext/disklayout/block_group_64.go new file mode 100644 index 000000000..9a809197a --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/block_group_64.go @@ -0,0 +1,93 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// BlockGroup64Bit emulates struct ext4_group_desc in fs/ext4/ext4.h. +// It is the block group descriptor struct for 64-bit ext4 filesystems. +// It implements BlockGroup interface. It is an extension of the 32-bit +// version of BlockGroup. +type BlockGroup64Bit struct { + // We embed the 32-bit struct here because 64-bit version is just an extension + // of the 32-bit version. + BlockGroup32Bit + + // 64-bit specific fields. + BlockBitmapHi uint32 + InodeBitmapHi uint32 + InodeTableHi uint32 + FreeBlocksCountHi uint16 + FreeInodesCountHi uint16 + UsedDirsCountHi uint16 + ItableUnusedHi uint16 + ExcludeBitmapHi uint32 + BlockBitmapChecksumHi uint16 + InodeBitmapChecksumHi uint16 + _ uint32 // Padding to 64 bytes. +} + +// Compiles only if BlockGroup64Bit implements BlockGroup. +var _ BlockGroup = (*BlockGroup64Bit)(nil) + +// Methods to override. Checksum() and Flags() are not overridden. + +// InodeTable implements BlockGroup.InodeTable. +func (bg *BlockGroup64Bit) InodeTable() uint64 { + return (uint64(bg.InodeTableHi) << 32) | uint64(bg.InodeTableLo) +} + +// BlockBitmap implements BlockGroup.BlockBitmap. +func (bg *BlockGroup64Bit) BlockBitmap() uint64 { + return (uint64(bg.BlockBitmapHi) << 32) | uint64(bg.BlockBitmapLo) +} + +// InodeBitmap implements BlockGroup.InodeBitmap. +func (bg *BlockGroup64Bit) InodeBitmap() uint64 { + return (uint64(bg.InodeBitmapHi) << 32) | uint64(bg.InodeBitmapLo) +} + +// ExclusionBitmap implements BlockGroup.ExclusionBitmap. +func (bg *BlockGroup64Bit) ExclusionBitmap() uint64 { + return (uint64(bg.ExcludeBitmapHi) << 32) | uint64(bg.ExcludeBitmapLo) +} + +// FreeBlocksCount implements BlockGroup.FreeBlocksCount. +func (bg *BlockGroup64Bit) FreeBlocksCount() uint32 { + return (uint32(bg.FreeBlocksCountHi) << 16) | uint32(bg.FreeBlocksCountLo) +} + +// FreeInodesCount implements BlockGroup.FreeInodesCount. +func (bg *BlockGroup64Bit) FreeInodesCount() uint32 { + return (uint32(bg.FreeInodesCountHi) << 16) | uint32(bg.FreeInodesCountLo) +} + +// DirectoryCount implements BlockGroup.DirectoryCount. +func (bg *BlockGroup64Bit) DirectoryCount() uint32 { + return (uint32(bg.UsedDirsCountHi) << 16) | uint32(bg.UsedDirsCountLo) +} + +// UnusedInodeCount implements BlockGroup.UnusedInodeCount. +func (bg *BlockGroup64Bit) UnusedInodeCount() uint32 { + return (uint32(bg.ItableUnusedHi) << 16) | uint32(bg.ItableUnusedLo) +} + +// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. +func (bg *BlockGroup64Bit) BlockBitmapChecksum() uint32 { + return (uint32(bg.BlockBitmapChecksumHi) << 16) | uint32(bg.BlockBitmapChecksumLo) +} + +// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. +func (bg *BlockGroup64Bit) InodeBitmapChecksum() uint32 { + return (uint32(bg.InodeBitmapChecksumHi) << 16) | uint32(bg.InodeBitmapChecksumLo) +} diff --git a/pkg/sentry/fs/ext/disklayout/block_group_test.go b/pkg/sentry/fs/ext/disklayout/block_group_test.go new file mode 100644 index 000000000..0ef4294c0 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/block_group_test.go @@ -0,0 +1,26 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestBlockGroupSize tests that the block group descriptor structs are of the +// correct size. +func TestBlockGroupSize(t *testing.T) { + assertSize(t, BlockGroup32Bit{}, 32) + assertSize(t, BlockGroup64Bit{}, 64) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go new file mode 100644 index 000000000..685bf57b8 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +const ( + // MaxFileName is the maximum length of an ext fs file's name. + MaxFileName = 255 +) + +var ( + // inodeTypeByFileType maps ext4 file types to vfs inode types. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. + inodeTypeByFileType = map[uint8]fs.InodeType{ + 0: fs.Anonymous, + 1: fs.RegularFile, + 2: fs.Directory, + 3: fs.CharacterDevice, + 4: fs.BlockDevice, + 5: fs.Pipe, + 6: fs.Socket, + 7: fs.Symlink, + } +) + +// The Dirent interface should be implemented by structs representing ext +// directory entries. These are for the linear classical directories which +// just store a list of dirent structs. A directory is a series of data blocks +// where is each data block contains a linear array of dirents. The last entry +// of the block has a record size that takes it to the end of the block. The +// end of the directory is when you read dirInode.Size() bytes from the blocks. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. +type Dirent interface { + // Inode returns the absolute inode number of the underlying inode. + // Inode number 0 signifies an unused dirent. + Inode() uint32 + + // RecordSize returns the record length of this dirent on disk. The next + // dirent in the dirent list should be read after these many bytes from + // the current dirent. Must be a multiple of 4. + RecordSize() uint16 + + // FileName returns the name of the file. Can be at most 255 is length. + FileName() string + + // FileType returns the inode type of the underlying inode. This is a + // performance hack so that we do not have to read the underlying inode struct + // to know the type of inode. This will only work when the SbDirentFileType + // feature is set. If not, the second returned value will be false indicating + // that user code has to use the inode mode to extract the file type. + FileType() (fs.InodeType, bool) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_new.go b/pkg/sentry/fs/ext/disklayout/dirent_new.go new file mode 100644 index 000000000..29ae4a5c2 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_new.go @@ -0,0 +1,61 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// DirentNew represents the ext4 directory entry struct. This emulates Linux's +// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we +// only need 8 bits to store the NameLength. As a result, NameLength has been +// shortened and the other 8 bits are used to encode the file type. Use the +// FileTypeRaw field only if the SbDirentFileType feature is set. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentNew struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint8 + FileTypeRaw uint8 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentNew implements Dirent. +var _ Dirent = (*DirentNew)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentNew) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentNew) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentNew) FileType() (fs.InodeType, bool) { + if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { + return inodeType, true + } + + panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fs/ext/disklayout/dirent_old.go new file mode 100644 index 000000000..2e0f9c812 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_old.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/fs" + +// DirentOld represents the old directory entry struct which does not contain +// the file type. This emulates Linux's ext4_dir_entry struct. This is used in +// ext2, ext3 and sometimes in ext4. +// +// Note: This struct can be of variable size on disk. The one described below +// is of maximum size and the FileName beyond NameLength bytes might contain +// garbage. +type DirentOld struct { + InodeNumber uint32 + RecordLength uint16 + NameLength uint16 + FileNameRaw [MaxFileName]byte +} + +// Compiles only if DirentOld implements Dirent. +var _ Dirent = (*DirentOld)(nil) + +// Inode implements Dirent.Inode. +func (d *DirentOld) Inode() uint32 { return d.InodeNumber } + +// RecordSize implements Dirent.RecordSize. +func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } + +// FileName implements Dirent.FileName. +func (d *DirentOld) FileName() string { + return string(d.FileNameRaw[:d.NameLength]) +} + +// FileType implements Dirent.FileType. +func (d *DirentOld) FileType() (fs.InodeType, bool) { + return fs.Anonymous, false +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go new file mode 100644 index 000000000..cc6dff2c9 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestDirentSize tests that the dirent structs are of the correct +// size. +func TestDirentSize(t *testing.T) { + want := uintptr(263) + + assertSize(t, DirentOld{}, want) + assertSize(t, DirentNew{}, want) +} diff --git a/pkg/sentry/fs/ext/disklayout/disklayout.go b/pkg/sentry/fs/ext/disklayout/disklayout.go new file mode 100644 index 000000000..bdf4e2132 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/disklayout.go @@ -0,0 +1,50 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package disklayout provides Linux ext file system's disk level structures +// which can be directly read into from the underlying device. Structs aim to +// emulate structures `exactly` how they are layed out on disk. +// +// This library aims to be compatible with all ext(2/3/4) systems so it +// provides a generic interface for all major structures and various +// implementations (for different versions). The user code is responsible for +// using appropriate implementations based on the underlying device. +// +// Interfacing all major structures here serves a few purposes: +// - Abstracts away the complexity of the underlying structure from client +// code. The client only has to figure out versioning on set up and then +// can use these as black boxes and pass it higher up the stack. +// - Having pointer receivers forces the user to use pointers to these +// heavy structs. Hence, prevents the client code from unintentionally +// copying these by value while passing the interface around. +// - Version-based implementation selection is resolved on set up hence +// avoiding per call overhead of choosing implementation. +// - All interface methods are pretty light weight (do not take in any +// parameters by design). Passing pointer arguments to interface methods +// can lead to heap allocation as the compiler won't be able to perform +// escape analysis on an unknown implementation at compile time. +// +// Notes: +// - All fields in these structs are exported because binary.Read would +// panic otherwise. +// - All structures on disk are in little-endian order. Only jbd2 (journal) +// structures are in big-endian order. +// - All OS dependent fields in these structures will be interpretted using +// the Linux version of that field. +// - The suffix `Lo` in field names stands for lower bits of that field. +// - The suffix `Hi` in field names stands for upper bits of that field. +// - The suffix `Raw` has been added to indicate that the field is not split +// into Lo and Hi fields and also to resolve name collision with the +// respective interface. +package disklayout diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go new file mode 100644 index 000000000..b48001910 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode.go @@ -0,0 +1,267 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// The Inode interface must be implemented by structs representing ext inodes. +// The inode stores all the metadata pertaining to the file (except for the +// file name which is held by the directory entry). It does NOT expose all +// fields and should be extended if need be. +// +// Some file systems (e.g. FAT) use the directory entry to store all this +// information. Ext file systems do not so that they can support hard links. +// However, ext4 cheats a little bit and duplicates the file type in the +// directory entry for performance gains. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. +type Inode interface { + // Mode returns the linux file mode which is majorly used to extract + // information like: + // - File permissions (read/write/execute by user/group/others). + // - Sticky, set UID and GID bits. + // - File type. + // + // Masks to extract this information are provided in pkg/abi/linux/file.go. + Mode() linux.FileMode + + // UID returns the owner UID. + UID() auth.KUID + + // GID returns the owner GID. + GID() auth.KGID + + // Size returns the size of the file in bytes. + Size() uint64 + + // InodeSize returns the size of this inode struct in bytes. + // In ext2 and ext3, the inode struct and inode disk record size was fixed at + // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. + // However, accessing any field beyond the 128 bytes marker must be verified + // using this method. + InodeSize() uint16 + + // AccessTime returns the last access time. Shows when the file was last read. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the extended attribute value checksum. + AccessTime() time.Time + + // ChangeTime returns the last change time. Shows when the file meta data + // (like permissions) was last changed. + // + // If InExtendedAttr is set, then this should NOT be used because the + // underlying field is used to store the lower 32 bits of the attribute + // value’s reference count. + ChangeTime() time.Time + + // ModificationTime returns the last modification time. Shows when the file + // content was last modified. + // + // If InExtendedAttr is set, then this should NOT be used because + // the underlying field contains the number of the inode that owns the + // extended attribute. + ModificationTime() time.Time + + // DeletionTime returns the deletion time. Inodes are marked as deleted by + // writing to the underlying field. FS tools can restore files until they are + // actually overwritten. + DeletionTime() time.Time + + // LinksCount returns the number of hard links to this inode. + // + // Normally there is an upper limit on the number of hard links: + // - ext2/ext3 = 32,000 + // - ext4 = 65,000 + // + // This implies that an ext4 directory cannot have more than 64,998 + // subdirectories because each subdirectory will have a hard link to the + // directory via the `..` entry. The directory has hard link via the `.` entry + // of its own. And finally the inode is initiated with 1 hard link (itself). + // + // The underlying value is reset to 1 if all the following hold: + // - Inode is a directory. + // - SbDirNlink is enabled. + // - Number of hard links is incremented past 64,999. + // Hard link value of 1 for a directory would indicate that the number of hard + // links is unknown because a directory can have minimum 2 hard links (itself + // and `.` entry). + LinksCount() uint16 + + // Flags returns InodeFlags which represents the inode flags. + Flags() InodeFlags + + // Blocks returns the underlying inode.i_block array. This field is special + // and is used to store various kinds of things depending on the filesystem + // version and inode type. + // - In ext2/ext3, it contains the block map. + // - In ext4, it contains the extent tree. + // - For inline files, it contains the file contents. + // - For symlinks, it contains the link path (if it fits here). + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. + Blocks() [60]byte +} + +// Inode flags. This is not comprehensive and flags which were not used in +// the Linux kernel have been excluded. +const ( + // InSync indicates that all writes to the file must be synchronous. + InSync = 0x8 + + // InImmutable indicates that this file is immutable. + InImmutable = 0x10 + + // InAppend indicates that this file can only be appended to. + InAppend = 0x20 + + // InNoDump indicates that teh dump(1) utility should not dump this file. + InNoDump = 0x40 + + // InNoAccessTime indicates that the access time of this inode must not be + // updated. + InNoAccessTime = 0x80 + + // InIndex indicates that this directory has hashed indexes. + InIndex = 0x1000 + + // InJournalData indicates that file data must always be written through a + // journal device. + InJournalData = 0x4000 + + // InDirSync indicates that all the directory entiry data must be written + // synchronously. + InDirSync = 0x10000 + + // InTopDir indicates that this inode is at the top of the directory hierarchy. + InTopDir = 0x20000 + + // InHugeFile indicates that this is a huge file. + InHugeFile = 0x40000 + + // InExtents indicates that this inode uses extents. + InExtents = 0x80000 + + // InExtendedAttr indicates that this inode stores a large extended attribute + // value in its data blocks. + InExtendedAttr = 0x200000 + + // InInline indicates that this inode has inline data. + InInline = 0x10000000 + + // InReserved indicates that this inode is reserved for the ext4 library. + InReserved = 0x80000000 +) + +// InodeFlags represents all possible combinations of inode flags. It aims to +// cover the bit masks and provide a more user-friendly interface. +type InodeFlags struct { + Sync bool + Immutable bool + Append bool + NoDump bool + NoAccessTime bool + Index bool + JournalData bool + DirSync bool + TopDir bool + HugeFile bool + Extents bool + ExtendedAttr bool + Inline bool + Reserved bool +} + +// ToInt converts inode flags back to its 32-bit rep. +func (f InodeFlags) ToInt() uint32 { + var res uint32 + + if f.Sync { + res |= InSync + } + if f.Immutable { + res |= InImmutable + } + if f.Append { + res |= InAppend + } + if f.NoDump { + res |= InNoDump + } + if f.NoAccessTime { + res |= InNoAccessTime + } + if f.Index { + res |= InIndex + } + if f.JournalData { + res |= InJournalData + } + if f.DirSync { + res |= InDirSync + } + if f.TopDir { + res |= InTopDir + } + if f.HugeFile { + res |= InHugeFile + } + if f.Extents { + res |= InExtents + } + if f.ExtendedAttr { + res |= InExtendedAttr + } + if f.Inline { + res |= InInline + } + if f.Reserved { + res |= InReserved + } + + return res +} + +// InodeFlagsFromInt converts the integer representation of inode flags to +// a InodeFlags struct. +func InodeFlagsFromInt(f uint32) InodeFlags { + return InodeFlags{ + Sync: f&InSync > 0, + Immutable: f&InImmutable > 0, + Append: f&InAppend > 0, + NoDump: f&InNoDump > 0, + NoAccessTime: f&InNoAccessTime > 0, + Index: f&InIndex > 0, + JournalData: f&InJournalData > 0, + DirSync: f&InDirSync > 0, + TopDir: f&InTopDir > 0, + HugeFile: f&InHugeFile > 0, + Extents: f&InExtents > 0, + ExtendedAttr: f&InExtendedAttr > 0, + Inline: f&InInline > 0, + Reserved: f&InReserved > 0, + } +} + +// These masks define how users can view/modify inode flags. The rest of the +// flags are for internal kernel usage only. +const ( + InUserReadFlagMask = 0x4BDFFF + InUserWriteFlagMask = 0x4B80FF +) diff --git a/pkg/sentry/fs/ext/disklayout/inode_new.go b/pkg/sentry/fs/ext/disklayout/inode_new.go new file mode 100644 index 000000000..4f5348372 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_new.go @@ -0,0 +1,96 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import "gvisor.dev/gvisor/pkg/sentry/kernel/time" + +// InodeNew represents ext4 inode structure which can be bigger than +// OldInodeSize. The actual size of this struct should be determined using +// inode.ExtraInodeSize. Accessing any field here should be verified with the +// actual size. The extra space between the end of the inode struct and end of +// the inode record can be used to store extended attr. +// +// If the TimeExtra fields are in scope, the lower 2 bits of those are used +// to extend their counter part to be 34 bits wide; the rest (upper) 30 bits +// are used to provide nanoscond precision. Hence, these timestamps will now +// overflow in May 2446. +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +type InodeNew struct { + InodeOld + + ExtraInodeSize uint16 + ChecksumHi uint16 + ChangeTimeExtra uint32 + ModificationTimeExtra uint32 + AccessTimeExtra uint32 + CreationTime uint32 + CreationTimeExtra uint32 + VersionHi uint32 + ProjectID uint32 +} + +// Compiles only if InodeNew implements Inode. +var _ Inode = (*InodeNew)(nil) + +// fromExtraTime decodes the extra time and constructs the kernel time struct +// with nanosecond precision. +func fromExtraTime(lo int32, extra uint32) time.Time { + // See description above InodeNew for format. + seconds := (int64(extra&0x3) << 32) + int64(lo) + nanoseconds := int64(extra >> 2) + return time.FromUnix(seconds, nanoseconds) +} + +// Only override methods which change due to ext4 specific fields. + +// Size implements Inode.Size. +func (in *InodeNew) Size() uint64 { + return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeNew) InodeSize() uint16 { + return oldInodeSize + in.ExtraInodeSize +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeNew) ChangeTime() time.Time { + // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. + if in.ExtraInodeSize >= 8 { + return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) + } + + return in.InodeOld.ChangeTime() +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeNew) ModificationTime() time.Time { + // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. + if in.ExtraInodeSize >= 12 { + return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) + } + + return in.InodeOld.ModificationTime() +} + +// AccessTime implements Inode.AccessTime. +func (in *InodeNew) AccessTime() time.Time { + // Apply new timestamp logic if inode.AccessTimeExtra is in scope. + if in.ExtraInodeSize >= 16 { + return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) + } + + return in.InodeOld.AccessTime() +} diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fs/ext/disklayout/inode_old.go new file mode 100644 index 000000000..dc4c9d8e4 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_old.go @@ -0,0 +1,117 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +const ( + // oldInodeSize is the inode size in ext2/ext3. + oldInodeSize = 128 +) + +// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. +// Inode struct size and record size are both 128 bytes for this. +// +// All fields representing time are in seconds since the epoch. Which means that +// they will overflow in January 2038. +type InodeOld struct { + ModeRaw uint16 + UIDLo uint16 + SizeLo uint32 + + // The time fields are signed integers because they could be negative to + // represent time before the epoch. + AccessTimeRaw int32 + ChangeTimeRaw int32 + ModificationTimeRaw int32 + DeletionTimeRaw int32 + + GIDLo uint16 + LinksCountRaw uint16 + BlocksCountLo uint32 + FlagsRaw uint32 + VersionLo uint32 // This is OS dependent. + BlocksRaw [60]byte + Generation uint32 + FileACLLo uint32 + SizeHi uint32 + ObsoFaddr uint32 + + // OS dependent fields have been inlined here. + BlocksCountHi uint16 + FileACLHi uint16 + UIDHi uint16 + GIDHi uint16 + ChecksumLo uint16 + _ uint16 +} + +// Compiles only if InodeOld implements Inode. +var _ Inode = (*InodeOld)(nil) + +// Mode implements Inode.Mode. +func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } + +// UID implements Inode.UID. +func (in *InodeOld) UID() auth.KUID { + return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) +} + +// GID implements Inode.GID. +func (in *InodeOld) GID() auth.KGID { + return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) +} + +// Size implements Inode.Size. +func (in *InodeOld) Size() uint64 { + // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. + return uint64(in.SizeLo) +} + +// InodeSize implements Inode.InodeSize. +func (in *InodeOld) InodeSize() uint16 { return oldInodeSize } + +// AccessTime implements Inode.AccessTime. +func (in *InodeOld) AccessTime() time.Time { + return time.FromUnix(int64(in.AccessTimeRaw), 0) +} + +// ChangeTime implements Inode.ChangeTime. +func (in *InodeOld) ChangeTime() time.Time { + return time.FromUnix(int64(in.ChangeTimeRaw), 0) +} + +// ModificationTime implements Inode.ModificationTime. +func (in *InodeOld) ModificationTime() time.Time { + return time.FromUnix(int64(in.ModificationTimeRaw), 0) +} + +// DeletionTime implements Inode.DeletionTime. +func (in *InodeOld) DeletionTime() time.Time { + return time.FromUnix(int64(in.DeletionTimeRaw), 0) +} + +// LinksCount implements Inode.LinksCount. +func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } + +// Flags implements Inode.Flags. +func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } + +// Blocks implements Inode.Blocks. +func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw } diff --git a/pkg/sentry/fs/ext/disklayout/inode_test.go b/pkg/sentry/fs/ext/disklayout/inode_test.go new file mode 100644 index 000000000..9cae9e4f0 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/inode_test.go @@ -0,0 +1,222 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "fmt" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// TestInodeSize tests that the inode structs are of the correct size. +func TestInodeSize(t *testing.T) { + assertSize(t, InodeOld{}, oldInodeSize) + + // This was updated from 156 bytes to 160 bytes in Oct 2015. + assertSize(t, InodeNew{}, 160) +} + +// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in +// ext4 inode structs are decoded correctly. +// +// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +func TestTimestampSeconds(t *testing.T) { + type timestampTest struct { + // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. + // If this is set then the 32-bit time is negative. + msbSet bool + + // lowerBound tells if we should take the lowest possible value of + // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to + // false it tells to take the highest possible value. + lowerBound bool + + // extraBits is InodeNew.[X]TimeExtra. + extraBits uint32 + + // want is the kernel time struct that is expected. + want time.Time + } + + tests := []timestampTest{ + // 1901-12-13 + { + msbSet: true, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(-0x80000000), 0), + }, + + // 1969-12-31 + { + msbSet: true, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(-1), 0), + }, + + // 1970-01-01 + { + msbSet: false, + lowerBound: true, + extraBits: 0, + want: time.FromUnix(int64(0), 0), + }, + + // 2038-01-19 + { + msbSet: false, + lowerBound: false, + extraBits: 0, + want: time.FromUnix(int64(0x7fffffff), 0), + }, + + // 2038-01-19 + { + msbSet: true, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x80000000), 0), + }, + + // 2106-02-07 + { + msbSet: true, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0xffffffff), 0), + }, + + // 2106-02-07 + { + msbSet: false, + lowerBound: true, + extraBits: 1, + want: time.FromUnix(int64(0x100000000), 0), + }, + + // 2174-02-25 + { + msbSet: false, + lowerBound: false, + extraBits: 1, + want: time.FromUnix(int64(0x17fffffff), 0), + }, + + // 2174-02-25 + { + msbSet: true, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x180000000), 0), + }, + + // 2242-03-16 + { + msbSet: true, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x1ffffffff), 0), + }, + + // 2242-03-16 + { + msbSet: false, + lowerBound: true, + extraBits: 2, + want: time.FromUnix(int64(0x200000000), 0), + }, + + // 2310-04-04 + { + msbSet: false, + lowerBound: false, + extraBits: 2, + want: time.FromUnix(int64(0x27fffffff), 0), + }, + + // 2310-04-04 + { + msbSet: true, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x280000000), 0), + }, + + // 2378-04-22 + { + msbSet: true, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x2ffffffff), 0), + }, + + // 2378-04-22 + { + msbSet: false, + lowerBound: true, + extraBits: 3, + want: time.FromUnix(int64(0x300000000), 0), + }, + + // 2446-05-10 + { + msbSet: false, + lowerBound: false, + extraBits: 3, + want: time.FromUnix(int64(0x37fffffff), 0), + }, + } + + lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 + upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 + lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 + upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 + + get32BitTime := func(test timestampTest) int32 { + if test.msbSet { + if test.lowerBound { + return lowerMSB1 + } + + return upperMSB1 + } + + if test.lowerBound { + return lowerMSB0 + } + + return upperMSB0 + } + + getTestName := func(test timestampTest) string { + return fmt.Sprintf( + "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", + strconv.FormatInt(int64(test.extraBits), 2), + test.msbSet, + test.lowerBound, + ) + } + + for _, test := range tests { + t.Run(getTestName(test), func(t *testing.T) { + if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { + t.Errorf("Expected: %v, Got: %v", test.want, got) + } + }) + } +} diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go new file mode 100644 index 000000000..e4b8f46fb --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock.go @@ -0,0 +1,468 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock should be implemented by structs representing the ext superblock. +// The superblock holds a lot of information about the enclosing filesystem. +// This interface aims to provide access methods to important information held +// by the superblock. It does NOT expose all fields of the superblock, only the +// ones necessary. This can be expanded when need be. +// +// Location and replication: +// - The superblock is located at offset 1024 in block group 0. +// - Redundant copies of the superblock and group descriptors are kept in +// all groups if SbSparse feature flag is NOT set. If it is set, the +// replicas only exist in groups whose group number is either 0 or a +// power of 3, 5, or 7. +// - There is also a sparse superblock feature v2 in which there are just +// two replicas saved in the block groups pointed by sb.s_backup_bgs. +// +// Replicas should eventually be updated if the superblock is updated. +// +// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. +type SuperBlock interface { + // InodesCount returns the total number of inodes in this filesystem. + InodesCount() uint32 + + // BlocksCount returns the total number of data blocks in this filesystem. + BlocksCount() uint64 + + // FreeBlocksCount returns the number of free blocks in this filesystem. + FreeBlocksCount() uint64 + + // FreeInodesCount returns the number of free inodes in this filesystem. + FreeInodesCount() uint32 + + // MountCount returns the number of mounts since the last fsck. + MountCount() uint16 + + // MaxMountCount returns the number of mounts allowed beyond which a fsck is + // needed. + MaxMountCount() uint16 + + // FirstDataBlock returns the absolute block number of the first data block, + // which contains the super block itself. + // + // If the filesystem has 1kb data blocks then this should return 1. For all + // other configurations, this typically returns 0. + // + // The first block group descriptor is in (FirstDataBlock() + 1)th block. + FirstDataBlock() uint32 + + // BlockSize returns the size of one data block in this filesystem. + // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that + // the smallest block size is 1kb. + BlockSize() uint64 + + // BlocksPerGroup returns the number of data blocks in a block group. + BlocksPerGroup() uint32 + + // ClusterSize returns block cluster size (set during mkfs time by admin). + // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that + // the smallest cluster size is 1kb. + // + // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature + // is NOT set and consequently BlockSize() = ClusterSize() in that case. + ClusterSize() uint64 + + // ClustersPerGroup returns: + // - number of clusters per group if bigalloc is enabled. + // - BlocksPerGroup() otherwise. + ClustersPerGroup() uint32 + + // InodeSize returns the size of the inode disk record size in bytes. Use this + // to iterate over inode arrays on disk. + // + // In ext2 and ext3: + // - Each inode had a disk record of 128 bytes. + // - The inode struct size was fixed at 128 bytes. + // + // In ext4 its possible to allocate larger on-disk inodes: + // - Inode disk record size = sb.s_inode_size (function return value). + // = 256 (default) + // - Inode struct size = 128 + inode.i_extra_isize. + // = 128 + 32 = 160 (default) + InodeSize() uint16 + + // InodesPerGroup returns the number of inodes in a block group. + InodesPerGroup() uint32 + + // BgDescSize returns the size of the block group descriptor struct. + // + // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor + // is only 32 bytes long. + // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST + // 64 bytes. It might be bigger than that. + BgDescSize() uint16 + + // CompatibleFeatures returns the CompatFeatures struct which holds all the + // compatible features this fs supports. + CompatibleFeatures() CompatFeatures + + // IncompatibleFeatures returns the CompatFeatures struct which holds all the + // incompatible features this fs supports. + IncompatibleFeatures() IncompatFeatures + + // ReadOnlyCompatibleFeatures returns the CompatFeatures struct which holds all the + // readonly compatible features this fs supports. + ReadOnlyCompatibleFeatures() RoCompatFeatures + + // Magic() returns the magic signature which must be 0xef53. + Magic() uint16 + + // Revision returns the superblock revision. Superblock struct fields from + // offset 0x54 till 0x150 should only be used if superblock has DynamicRev. + Revision() SbRevision +} + +// SbRevision is the type for superblock revisions. +type SbRevision int + +// Super block revisions. +const ( + // OldRev is the good old (original) format. + OldRev SbRevision = 0 + + // DynamicRev is v2 format w/ dynamic inode sizes. + DynamicRev SbRevision = 1 +) + +// Superblock compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirPrealloc indicates directory preallocation. + SbDirPrealloc = 0x1 + + // SbHasJournal indicates the presence of a journal. jbd2 should only work + // with this being set. + SbHasJournal = 0x4 + + // SbExtAttr indicates extended attributes support. + SbExtAttr = 0x8 + + // SbResizeInode indicates that the fs has reserved GDT blocks (right after + // group descriptors) for fs expansion. + SbResizeInode = 0x10 + + // SbDirIndex indicates that the fs has directory indices. + SbDirIndex = 0x20 + + // SbSparseV2 stands for Sparse superblock version 2. + SbSparseV2 = 0x200 +) + +// CompatFeatures represents a superblock's compatible feature set. If the +// kernel does not understand any of these feature, it can still read/write +// to this fs. +type CompatFeatures struct { + DirPrealloc bool + HasJournal bool + ExtAttr bool + ResizeInode bool + DirIndex bool + SparseV2 bool +} + +// ToInt converts superblock compatible features back to its 32-bit rep. +func (f CompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirPrealloc { + res |= SbDirPrealloc + } + if f.HasJournal { + res |= SbHasJournal + } + if f.ExtAttr { + res |= SbExtAttr + } + if f.ResizeInode { + res |= SbResizeInode + } + if f.DirIndex { + res |= SbDirIndex + } + if f.SparseV2 { + res |= SbSparseV2 + } + + return res +} + +// CompatFeaturesFromInt converts the integer representation of superblock +// compatible features to CompatFeatures struct. +func CompatFeaturesFromInt(f uint32) CompatFeatures { + return CompatFeatures{ + DirPrealloc: f&SbDirPrealloc > 0, + HasJournal: f&SbHasJournal > 0, + ExtAttr: f&SbExtAttr > 0, + ResizeInode: f&SbResizeInode > 0, + DirIndex: f&SbDirIndex > 0, + SparseV2: f&SbSparseV2 > 0, + } +} + +// Superblock incompatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbDirentFileType indicates that directory entries record the file type. + // We should use struct ext4_dir_entry_2 for dirents then. + SbDirentFileType = 0x2 + + // SbRecovery indicates that the filesystem needs recovery. + SbRecovery = 0x4 + + // SbJournalDev indicates that the filesystem has a separate journal device. + SbJournalDev = 0x8 + + // SbMetaBG indicates that the filesystem is using Meta block groups. Moves + // the group descriptors from the congested first block group into the first + // group of each metablock group to increase the maximum block groups limit + // and hence support much larger filesystems. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups. + SbMetaBG = 0x10 + + // SbExtents indicates that the filesystem uses extents. Must be set in ext4 + // filesystems. + SbExtents = 0x40 + + // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits. + // Hence can support 2^64 data blocks. + SbIs64Bit = 0x80 + + // SbMMP indicates that this filesystem has multiple mount protection. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection. + SbMMP = 0x100 + + // SbFlexBg indicates that this filesystem has flexible block groups. Several + // block groups are tied into one logical block group so that all the metadata + // for the block groups (bitmaps and inode tables) are close together for + // faster loading. Consequently, large files will be continuous on disk. + // However, this does not affect the placement of redundant superblocks and + // group descriptors. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups. + SbFlexBg = 0x200 + + // SbLargeDir shows that large directory enabled. Directory htree can be 3 + // levels deep. Directory htrees are allowed to be 2 levels deep otherwise. + SbLargeDir = 0x4000 + + // SbInlineData allows inline data in inodes for really small files. + SbInlineData = 0x8000 + + // SbEncrypted indicates that this fs contains encrypted inodes. + SbEncrypted = 0x10000 +) + +// IncompatFeatures represents a superblock's incompatible feature set. If the +// kernel does not understand any of these feature, it should refuse to mount. +type IncompatFeatures struct { + DirentFileType bool + Recovery bool + JournalDev bool + MetaBG bool + Extents bool + Is64Bit bool + MMP bool + FlexBg bool + LargeDir bool + InlineData bool + Encrypted bool +} + +// ToInt converts superblock incompatible features back to its 32-bit rep. +func (f IncompatFeatures) ToInt() uint32 { + var res uint32 + + if f.DirentFileType { + res |= SbDirentFileType + } + if f.Recovery { + res |= SbRecovery + } + if f.JournalDev { + res |= SbJournalDev + } + if f.MetaBG { + res |= SbMetaBG + } + if f.Extents { + res |= SbExtents + } + if f.Is64Bit { + res |= SbIs64Bit + } + if f.MMP { + res |= SbMMP + } + if f.FlexBg { + res |= SbFlexBg + } + if f.LargeDir { + res |= SbLargeDir + } + if f.InlineData { + res |= SbInlineData + } + if f.Encrypted { + res |= SbEncrypted + } + + return res +} + +// IncompatFeaturesFromInt converts the integer representation of superblock +// incompatible features to IncompatFeatures struct. +func IncompatFeaturesFromInt(f uint32) IncompatFeatures { + return IncompatFeatures{ + DirentFileType: f&SbDirentFileType > 0, + Recovery: f&SbRecovery > 0, + JournalDev: f&SbJournalDev > 0, + MetaBG: f&SbMetaBG > 0, + Extents: f&SbExtents > 0, + Is64Bit: f&SbIs64Bit > 0, + MMP: f&SbMMP > 0, + FlexBg: f&SbFlexBg > 0, + LargeDir: f&SbLargeDir > 0, + InlineData: f&SbInlineData > 0, + Encrypted: f&SbEncrypted > 0, + } +} + +// Superblock readonly compatible features. +// This is not exhaustive, unused features are not listed. +const ( + // SbSparse indicates sparse superblocks. Only groups with number either 0 or + // a power of 3, 5, or 7 will have redundant copies of the superblock and + // block descriptors. + SbSparse = 0x1 + + // SbLargeFile indicates that this fs has been used to store a file >= 2GiB. + SbLargeFile = 0x2 + + // SbHugeFile indicates that this fs contains files whose sizes are + // represented in units of logicals blocks, not 512-byte sectors. + SbHugeFile = 0x8 + + // SbGdtCsum indicates that group descriptors have checksums. + SbGdtCsum = 0x10 + + // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a + // 32,000 subdirectory limit. + SbDirNlink = 0x20 + + // SbExtraIsize indicates that large inodes exist on this filesystem. + SbExtraIsize = 0x40 + + // SbHasSnapshot indicates the existence of a snapshot. + SbHasSnapshot = 0x80 + + // SbQuota enables usage tracking for all quota types. + SbQuota = 0x100 + + // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation + // unit becomes a cluster rather than a data block. Then block bitmaps track + // clusters, not data blocks. + // + // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc. + SbBigalloc = 0x200 + + // SbMetadataCsum indicates that the fs supports metadata checksumming. + SbMetadataCsum = 0x400 + + // SbReadOnly marks this filesystem as readonly. Should refuse to mount in + // read/write mode. + SbReadOnly = 0x1000 +) + +// RoCompatFeatures represents a superblock's readonly compatible feature set. +// If the kernel does not understand any of these feature, it can still mount +// readonly. But if the user wants to mount read/write, the kernel should +// refuse to mount. +type RoCompatFeatures struct { + Sparse bool + LargeFile bool + HugeFile bool + GdtCsum bool + DirNlink bool + ExtraIsize bool + HasSnapshot bool + Quota bool + Bigalloc bool + MetadataCsum bool + ReadOnly bool +} + +// ToInt converts superblock readonly compatible features to its 32-bit rep. +func (f RoCompatFeatures) ToInt() uint32 { + var res uint32 + + if f.Sparse { + res |= SbSparse + } + if f.LargeFile { + res |= SbLargeFile + } + if f.HugeFile { + res |= SbHugeFile + } + if f.GdtCsum { + res |= SbGdtCsum + } + if f.DirNlink { + res |= SbDirNlink + } + if f.ExtraIsize { + res |= SbExtraIsize + } + if f.HasSnapshot { + res |= SbHasSnapshot + } + if f.Quota { + res |= SbQuota + } + if f.Bigalloc { + res |= SbBigalloc + } + if f.MetadataCsum { + res |= SbMetadataCsum + } + if f.ReadOnly { + res |= SbReadOnly + } + + return res +} + +// RoCompatFeaturesFromInt converts the integer representation of superblock +// readonly compatible features to RoCompatFeatures struct. +func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures { + return RoCompatFeatures{ + Sparse: f&SbSparse > 0, + LargeFile: f&SbLargeFile > 0, + HugeFile: f&SbHugeFile > 0, + GdtCsum: f&SbGdtCsum > 0, + DirNlink: f&SbDirNlink > 0, + ExtraIsize: f&SbExtraIsize > 0, + HasSnapshot: f&SbHasSnapshot > 0, + Quota: f&SbQuota > 0, + Bigalloc: f&SbBigalloc > 0, + MetadataCsum: f&SbMetadataCsum > 0, + ReadOnly: f&SbReadOnly > 0, + } +} diff --git a/pkg/sentry/fs/ext/disklayout/superblock_32.go b/pkg/sentry/fs/ext/disklayout/superblock_32.go new file mode 100644 index 000000000..587e4afaa --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_32.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. +type SuperBlock32Bit struct { + // We embed the old superblock struct here because the 32-bit version is just + // an extension of the old version. + SuperBlockOld + + FirstInode uint32 + InodeSizeRaw uint16 + BlockGroupNumber uint16 + FeatureCompat uint32 + FeatureIncompat uint32 + FeatureRoCompat uint32 + UUID [16]byte + VolumeName [16]byte + LastMounted [64]byte + AlgoUsageBitmap uint32 + PreallocBlocks uint8 + PreallocDirBlocks uint8 + ReservedGdtBlocks uint16 + JournalUUID [16]byte + JournalInum uint32 + JournalDev uint32 + LastOrphan uint32 + HashSeed [4]uint32 + DefaultHashVersion uint8 + JnlBackupType uint8 + BgDescSizeRaw uint16 + DefaultMountOpts uint32 + FirstMetaBg uint32 + MkfsTime uint32 + JnlBlocks [17]uint32 +} + +// Compiles only if SuperBlock32Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock32Bit)(nil) + +// Only override methods which change based on the additional fields above. +// Not overriding SuperBlock.BgDescSize because it would still return 32 here. + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlock32Bit) InodeSize() uint16 { + return sb.InodeSizeRaw +} + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures { + return CompatFeaturesFromInt(sb.FeatureCompat) +} + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. +func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures { + return IncompatFeaturesFromInt(sb.FeatureIncompat) +} + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures { + return RoCompatFeaturesFromInt(sb.FeatureRoCompat) +} diff --git a/pkg/sentry/fs/ext/disklayout/superblock_64.go b/pkg/sentry/fs/ext/disklayout/superblock_64.go new file mode 100644 index 000000000..a2c2278fb --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_64.go @@ -0,0 +1,94 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of +// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly +// 1024 bytes (smallest possible block size) and hence the superblock always +// fits in no more than one data block. +type SuperBlock64Bit struct { + // We embed the 32-bit struct here because 64-bit version is just an extension + // of the 32-bit version. + SuperBlock32Bit + + BlocksCountHi uint32 + ReservedBlocksCountHi uint32 + FreeBlocksCountHi uint32 + MinInodeSize uint16 + WantInodeSize uint16 + Flags uint32 + RaidStride uint16 + MmpInterval uint16 + MmpBlock uint64 + RaidStripeWidth uint32 + LogGroupsPerFlex uint8 + ChecksumType uint8 + _ uint16 + KbytesWritten uint64 + SnapshotInum uint32 + SnapshotID uint32 + SnapshotRsrvBlocksCount uint64 + SnapshotList uint32 + ErrorCount uint32 + FirstErrorTime uint32 + FirstErrorInode uint32 + FirstErrorBlock uint64 + FirstErrorFunction [32]byte + FirstErrorLine uint32 + LastErrorTime uint32 + LastErrorInode uint32 + LastErrorLine uint32 + LastErrorBlock uint64 + LastErrorFunction [32]byte + MountOpts [64]byte + UserQuotaInum uint32 + GroupQuotaInum uint32 + OverheadBlocks uint32 + BackupBgs [2]uint32 + EncryptAlgos [4]uint8 + EncryptPwSalt [16]uint8 + LostFoundInode uint32 + ProjectQuotaInode uint32 + ChecksumSeed uint32 + WtimeHi uint8 + MtimeHi uint8 + MkfsTimeHi uint8 + LastCheckHi uint8 + FirstErrorTimeHi uint8 + LastErrorTimeHi uint8 + _ [2]uint8 + Encoding uint16 + EncodingFlags uint16 + _ [95]uint32 + Checksum uint32 +} + +// Compiles only if SuperBlock64Bit implements SuperBlock. +var _ SuperBlock = (*SuperBlock64Bit)(nil) + +// Only override methods which change based on the 64-bit feature. + +// BlocksCount implements SuperBlock.BlocksCount. +func (sb *SuperBlock64Bit) BlocksCount() uint64 { + return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo) +} + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 { + return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo) +} + +// BgDescSize implements SuperBlock.BgDescSize. +func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw } diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fs/ext/disklayout/superblock_old.go new file mode 100644 index 000000000..c74953610 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_old.go @@ -0,0 +1,102 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +// SuperBlockOld implements SuperBlock and represents the old version of the +// superblock struct in ext2 and ext3 systems. +type SuperBlockOld struct { + InodesCountRaw uint32 + BlocksCountLo uint32 + ReservedBlocksCount uint32 + FreeBlocksCountLo uint32 + FreeInodesCountRaw uint32 + FirstDataBlockRaw uint32 + LogBlockSize uint32 + LogClusterSize uint32 + BlocksPerGroupRaw uint32 + ClustersPerGroupRaw uint32 + InodesPerGroupRaw uint32 + Mtime uint32 + Wtime uint32 + MountCountRaw uint16 + MaxMountCountRaw uint16 + MagicRaw uint16 + State uint16 + Errors uint16 + MinorRevLevel uint16 + LastCheck uint32 + CheckInterval uint32 + CreatorOS uint32 + RevLevel uint32 + DefResUID uint16 + DefResGID uint16 +} + +// InodesCount implements SuperBlock.InodesCount. +func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } + +// BlocksCount implements SuperBlock.BlocksCount. +func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) } + +// FreeBlocksCount implements SuperBlock.FreeBlocksCount. +func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) } + +// FreeInodesCount implements SuperBlock.FreeInodesCount. +func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw } + +// MountCount implements SuperBlock.MountCount. +func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw } + +// MaxMountCount implements SuperBlock.MaxMountCount. +func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw } + +// FirstDataBlock implements SuperBlock.FirstDataBlock. +func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw } + +// BlockSize implements SuperBlock.BlockSize. +func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) } + +// BlocksPerGroup implements SuperBlock.BlocksPerGroup. +func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw } + +// ClusterSize implements SuperBlock.ClusterSize. +func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) } + +// ClustersPerGroup implements SuperBlock.ClustersPerGroup. +func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } + +// InodeSize implements SuperBlock.InodeSize. +func (sb *SuperBlockOld) InodeSize() uint16 { return oldInodeSize } + +// InodesPerGroup implements SuperBlock.InodesPerGroup. +func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } + +// BgDescSize implements SuperBlock.BgDescSize. +func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 } + +// CompatibleFeatures implements SuperBlock.CompatibleFeatures. +func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} } + +// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. +func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} } + +// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. +func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} } + +// Magic implements SuperBlock.Magic. +func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw } + +// Revision implements SuperBlock.Revision. +func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) } diff --git a/pkg/sentry/fs/ext/disklayout/superblock_test.go b/pkg/sentry/fs/ext/disklayout/superblock_test.go new file mode 100644 index 000000000..463b5ba21 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/superblock_test.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "testing" +) + +// TestSuperBlockSize tests that the superblock structs are of the correct +// size. +func TestSuperBlockSize(t *testing.T) { + assertSize(t, SuperBlockOld{}, 84) + assertSize(t, SuperBlock32Bit{}, 336) + assertSize(t, SuperBlock64Bit{}, 1024) +} diff --git a/pkg/sentry/fs/ext/disklayout/test_utils.go b/pkg/sentry/fs/ext/disklayout/test_utils.go new file mode 100644 index 000000000..9c63f04c0 --- /dev/null +++ b/pkg/sentry/fs/ext/disklayout/test_utils.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disklayout + +import ( + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/binary" +) + +func assertSize(t *testing.T, v interface{}, want uintptr) { + t.Helper() + + if got := binary.Size(v); got != want { + t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) + } +} diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go new file mode 100644 index 000000000..7602e2bf0 --- /dev/null +++ b/pkg/sentry/fs/ext/ext.go @@ -0,0 +1,49 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ext implements readonly ext(2/3/4) filesystems. +package ext + +import ( + "io" + "sync" + + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" +) + +// Filesystem implements vfs.FilesystemImpl. +type Filesystem struct { + // dev is the ReadSeeker for the underlying fs device and is protected by mu. + dev io.ReadSeeker + + // mu synchronizes the usage of dev. The ext filesystems take locality into + // condsideration, i.e. data blocks of a file will tend to be placed close + // together. On a spinning disk, locality reduces the amount of movement of + // the head hence speeding up IO operations. On an SSD there are no moving + // parts but locality increases the size of each transer request. Hence, + // having mutual exclusion on the read seeker while reading a file *should* + // help in achieving the intended performance gains. + // + // Note: This synchronization was not coupled with the ReadSeeker itself + // because we want to synchronize across read/seek operations for the + // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. + mu sync.Mutex + + // sb represents the filesystem superblock. Immutable after initialization. + sb disklayout.SuperBlock + + // bgs represents all the block group descriptors for the filesystem. + // Immutable after initialization. + bgs []disklayout.BlockGroup +} diff --git a/pkg/sentry/fs/ext4/BUILD b/pkg/sentry/fs/ext4/BUILD deleted file mode 100644 index 9dce67635..000000000 --- a/pkg/sentry/fs/ext4/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "ext4", - srcs = ["fs.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/context", - "//pkg/sentry/fs", - ], -) diff --git a/pkg/sentry/fs/ext4/disklayout/BUILD b/pkg/sentry/fs/ext4/disklayout/BUILD deleted file mode 100644 index 2920bfb52..000000000 --- a/pkg/sentry/fs/ext4/disklayout/BUILD +++ /dev/null @@ -1,45 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -go_library( - name = "disklayout", - srcs = [ - "block_group.go", - "block_group_32.go", - "block_group_64.go", - "dirent.go", - "dirent_new.go", - "dirent_old.go", - "disklayout.go", - "inode.go", - "inode_new.go", - "inode_old.go", - "superblock.go", - "superblock_32.go", - "superblock_64.go", - "superblock_old.go", - "test_utils.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext4/disklayout", - deps = [ - "//pkg/abi/linux", - "//pkg/binary", - "//pkg/sentry/fs", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - ], -) - -go_test( - name = "disklayout_test", - size = "small", - srcs = [ - "block_group_test.go", - "dirent_test.go", - "inode_test.go", - "superblock_test.go", - ], - embed = [":disklayout"], - deps = ["//pkg/sentry/kernel/time"], -) diff --git a/pkg/sentry/fs/ext4/disklayout/block_group.go b/pkg/sentry/fs/ext4/disklayout/block_group.go deleted file mode 100644 index 32ea3d97d..000000000 --- a/pkg/sentry/fs/ext4/disklayout/block_group.go +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// BlockGroup represents a Linux ext block group descriptor. An ext file system -// is split into a series of block groups. This provides an access layer to -// information needed to access and use a block group. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. -type BlockGroup interface { - // InodeTable returns the absolute block number of the block containing the - // inode table. This points to an array of Inode structs. Inode tables are - // statically allocated at mkfs time. The superblock records the number of - // inodes per group (length of this table) and the size of each inode struct. - InodeTable() uint64 - - // BlockBitmap returns the absolute block number of the block containing the - // block bitmap. This bitmap tracks the usage of data blocks within this block - // group and has its own checksum. - BlockBitmap() uint64 - - // InodeBitmap returns the absolute block number of the block containing the - // inode bitmap. This bitmap tracks the usage of this group's inode table - // entries and has its own checksum. - InodeBitmap() uint64 - - // ExclusionBitmap returns the absolute block number of the snapshot exclusion - // bitmap. - ExclusionBitmap() uint64 - - // FreeBlocksCount returns the number of free blocks in the group. - FreeBlocksCount() uint32 - - // FreeInodesCount returns the number of free inodes in the group. - FreeInodesCount() uint32 - - // DirectoryCount returns the number of inodes that represent directories - // under this block group. - DirectoryCount() uint32 - - // UnusedInodeCount returns the number of unused inodes beyond the last used - // inode in this group's inode table. As a result, we needn’t scan past the - // (InodesPerGroup - UnusedInodeCount())th entry in the inode table. - UnusedInodeCount() uint32 - - // BlockBitmapChecksum returns the block bitmap checksum. This is calculated - // using crc32c(FS UUID + group number + entire bitmap). - BlockBitmapChecksum() uint32 - - // InodeBitmapChecksum returns the inode bitmap checksum. This is calculated - // using crc32c(FS UUID + group number + entire bitmap). - InodeBitmapChecksum() uint32 - - // Checksum returns this block group's checksum. - // - // If SbMetadataCsum feature is set: - // - checksum is crc32c(FS UUID + group number + group descriptor - // structure) & 0xFFFF. - // - // If SbGdtCsum feature is set: - // - checksum is crc16(FS UUID + group number + group descriptor - // structure). - // - // SbMetadataCsum and SbGdtCsum should not be both set. - // If they are, Linux warns and asks to run fsck. - Checksum() uint16 - - // Flags returns BGFlags which represents the block group flags. - Flags() BGFlags -} - -// These are the different block group flags. -const ( - // BgInodeUninit indicates that inode table and bitmap are not initialized. - BgInodeUninit uint16 = 0x1 - - // BgBlockUninit indicates that block bitmap is not initialized. - BgBlockUninit uint16 = 0x2 - - // BgInodeZeroed indicates that inode table is zeroed. - BgInodeZeroed uint16 = 0x4 -) - -// BGFlags represents all the different combinations of block group flags. -type BGFlags struct { - InodeUninit bool - BlockUninit bool - InodeZeroed bool -} - -// ToInt converts a BGFlags struct back to its 16-bit representation. -func (f BGFlags) ToInt() uint16 { - var res uint16 - - if f.InodeUninit { - res |= BgInodeUninit - } - if f.BlockUninit { - res |= BgBlockUninit - } - if f.InodeZeroed { - res |= BgInodeZeroed - } - - return res -} - -// BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct. -func BGFlagsFromInt(flags uint16) BGFlags { - return BGFlags{ - InodeUninit: flags&BgInodeUninit > 0, - BlockUninit: flags&BgBlockUninit > 0, - InodeZeroed: flags&BgInodeZeroed > 0, - } -} diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_32.go b/pkg/sentry/fs/ext4/disklayout/block_group_32.go deleted file mode 100644 index 3e16c76db..000000000 --- a/pkg/sentry/fs/ext4/disklayout/block_group_32.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// BlockGroup32Bit emulates the first half of struct ext4_group_desc in -// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and -// 32-bit ext4 filesystems. It implements BlockGroup interface. -type BlockGroup32Bit struct { - BlockBitmapLo uint32 - InodeBitmapLo uint32 - InodeTableLo uint32 - FreeBlocksCountLo uint16 - FreeInodesCountLo uint16 - UsedDirsCountLo uint16 - FlagsRaw uint16 - ExcludeBitmapLo uint32 - BlockBitmapChecksumLo uint16 - InodeBitmapChecksumLo uint16 - ItableUnusedLo uint16 - ChecksumRaw uint16 -} - -// Compiles only if BlockGroup32Bit implements BlockGroup. -var _ BlockGroup = (*BlockGroup32Bit)(nil) - -// InodeTable implements BlockGroup.InodeTable. -func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } - -// BlockBitmap implements BlockGroup.BlockBitmap. -func (bg *BlockGroup32Bit) BlockBitmap() uint64 { return uint64(bg.BlockBitmapLo) } - -// InodeBitmap implements BlockGroup.InodeBitmap. -func (bg *BlockGroup32Bit) InodeBitmap() uint64 { return uint64(bg.InodeBitmapLo) } - -// ExclusionBitmap implements BlockGroup.ExclusionBitmap. -func (bg *BlockGroup32Bit) ExclusionBitmap() uint64 { return uint64(bg.ExcludeBitmapLo) } - -// FreeBlocksCount implements BlockGroup.FreeBlocksCount. -func (bg *BlockGroup32Bit) FreeBlocksCount() uint32 { return uint32(bg.FreeBlocksCountLo) } - -// FreeInodesCount implements BlockGroup.FreeInodesCount. -func (bg *BlockGroup32Bit) FreeInodesCount() uint32 { return uint32(bg.FreeInodesCountLo) } - -// DirectoryCount implements BlockGroup.DirectoryCount. -func (bg *BlockGroup32Bit) DirectoryCount() uint32 { return uint32(bg.UsedDirsCountLo) } - -// UnusedInodeCount implements BlockGroup.UnusedInodeCount. -func (bg *BlockGroup32Bit) UnusedInodeCount() uint32 { return uint32(bg.ItableUnusedLo) } - -// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. -func (bg *BlockGroup32Bit) BlockBitmapChecksum() uint32 { return uint32(bg.BlockBitmapChecksumLo) } - -// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. -func (bg *BlockGroup32Bit) InodeBitmapChecksum() uint32 { return uint32(bg.InodeBitmapChecksumLo) } - -// Checksum implements BlockGroup.Checksum. -func (bg *BlockGroup32Bit) Checksum() uint16 { return bg.ChecksumRaw } - -// Flags implements BlockGroup.Flags. -func (bg *BlockGroup32Bit) Flags() BGFlags { return BGFlagsFromInt(bg.FlagsRaw) } diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_64.go b/pkg/sentry/fs/ext4/disklayout/block_group_64.go deleted file mode 100644 index 9a809197a..000000000 --- a/pkg/sentry/fs/ext4/disklayout/block_group_64.go +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// BlockGroup64Bit emulates struct ext4_group_desc in fs/ext4/ext4.h. -// It is the block group descriptor struct for 64-bit ext4 filesystems. -// It implements BlockGroup interface. It is an extension of the 32-bit -// version of BlockGroup. -type BlockGroup64Bit struct { - // We embed the 32-bit struct here because 64-bit version is just an extension - // of the 32-bit version. - BlockGroup32Bit - - // 64-bit specific fields. - BlockBitmapHi uint32 - InodeBitmapHi uint32 - InodeTableHi uint32 - FreeBlocksCountHi uint16 - FreeInodesCountHi uint16 - UsedDirsCountHi uint16 - ItableUnusedHi uint16 - ExcludeBitmapHi uint32 - BlockBitmapChecksumHi uint16 - InodeBitmapChecksumHi uint16 - _ uint32 // Padding to 64 bytes. -} - -// Compiles only if BlockGroup64Bit implements BlockGroup. -var _ BlockGroup = (*BlockGroup64Bit)(nil) - -// Methods to override. Checksum() and Flags() are not overridden. - -// InodeTable implements BlockGroup.InodeTable. -func (bg *BlockGroup64Bit) InodeTable() uint64 { - return (uint64(bg.InodeTableHi) << 32) | uint64(bg.InodeTableLo) -} - -// BlockBitmap implements BlockGroup.BlockBitmap. -func (bg *BlockGroup64Bit) BlockBitmap() uint64 { - return (uint64(bg.BlockBitmapHi) << 32) | uint64(bg.BlockBitmapLo) -} - -// InodeBitmap implements BlockGroup.InodeBitmap. -func (bg *BlockGroup64Bit) InodeBitmap() uint64 { - return (uint64(bg.InodeBitmapHi) << 32) | uint64(bg.InodeBitmapLo) -} - -// ExclusionBitmap implements BlockGroup.ExclusionBitmap. -func (bg *BlockGroup64Bit) ExclusionBitmap() uint64 { - return (uint64(bg.ExcludeBitmapHi) << 32) | uint64(bg.ExcludeBitmapLo) -} - -// FreeBlocksCount implements BlockGroup.FreeBlocksCount. -func (bg *BlockGroup64Bit) FreeBlocksCount() uint32 { - return (uint32(bg.FreeBlocksCountHi) << 16) | uint32(bg.FreeBlocksCountLo) -} - -// FreeInodesCount implements BlockGroup.FreeInodesCount. -func (bg *BlockGroup64Bit) FreeInodesCount() uint32 { - return (uint32(bg.FreeInodesCountHi) << 16) | uint32(bg.FreeInodesCountLo) -} - -// DirectoryCount implements BlockGroup.DirectoryCount. -func (bg *BlockGroup64Bit) DirectoryCount() uint32 { - return (uint32(bg.UsedDirsCountHi) << 16) | uint32(bg.UsedDirsCountLo) -} - -// UnusedInodeCount implements BlockGroup.UnusedInodeCount. -func (bg *BlockGroup64Bit) UnusedInodeCount() uint32 { - return (uint32(bg.ItableUnusedHi) << 16) | uint32(bg.ItableUnusedLo) -} - -// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. -func (bg *BlockGroup64Bit) BlockBitmapChecksum() uint32 { - return (uint32(bg.BlockBitmapChecksumHi) << 16) | uint32(bg.BlockBitmapChecksumLo) -} - -// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. -func (bg *BlockGroup64Bit) InodeBitmapChecksum() uint32 { - return (uint32(bg.InodeBitmapChecksumHi) << 16) | uint32(bg.InodeBitmapChecksumLo) -} diff --git a/pkg/sentry/fs/ext4/disklayout/block_group_test.go b/pkg/sentry/fs/ext4/disklayout/block_group_test.go deleted file mode 100644 index 0ef4294c0..000000000 --- a/pkg/sentry/fs/ext4/disklayout/block_group_test.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "testing" -) - -// TestBlockGroupSize tests that the block group descriptor structs are of the -// correct size. -func TestBlockGroupSize(t *testing.T) { - assertSize(t, BlockGroup32Bit{}, 32) - assertSize(t, BlockGroup64Bit{}, 64) -} diff --git a/pkg/sentry/fs/ext4/disklayout/dirent.go b/pkg/sentry/fs/ext4/disklayout/dirent.go deleted file mode 100644 index 685bf57b8..000000000 --- a/pkg/sentry/fs/ext4/disklayout/dirent.go +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -const ( - // MaxFileName is the maximum length of an ext fs file's name. - MaxFileName = 255 -) - -var ( - // inodeTypeByFileType maps ext4 file types to vfs inode types. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. - inodeTypeByFileType = map[uint8]fs.InodeType{ - 0: fs.Anonymous, - 1: fs.RegularFile, - 2: fs.Directory, - 3: fs.CharacterDevice, - 4: fs.BlockDevice, - 5: fs.Pipe, - 6: fs.Socket, - 7: fs.Symlink, - } -) - -// The Dirent interface should be implemented by structs representing ext -// directory entries. These are for the linear classical directories which -// just store a list of dirent structs. A directory is a series of data blocks -// where is each data block contains a linear array of dirents. The last entry -// of the block has a record size that takes it to the end of the block. The -// end of the directory is when you read dirInode.Size() bytes from the blocks. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. -type Dirent interface { - // Inode returns the absolute inode number of the underlying inode. - // Inode number 0 signifies an unused dirent. - Inode() uint32 - - // RecordSize returns the record length of this dirent on disk. The next - // dirent in the dirent list should be read after these many bytes from - // the current dirent. Must be a multiple of 4. - RecordSize() uint16 - - // FileName returns the name of the file. Can be at most 255 is length. - FileName() string - - // FileType returns the inode type of the underlying inode. This is a - // performance hack so that we do not have to read the underlying inode struct - // to know the type of inode. This will only work when the SbDirentFileType - // feature is set. If not, the second returned value will be false indicating - // that user code has to use the inode mode to extract the file type. - FileType() (fs.InodeType, bool) -} diff --git a/pkg/sentry/fs/ext4/disklayout/dirent_new.go b/pkg/sentry/fs/ext4/disklayout/dirent_new.go deleted file mode 100644 index 29ae4a5c2..000000000 --- a/pkg/sentry/fs/ext4/disklayout/dirent_new.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// DirentNew represents the ext4 directory entry struct. This emulates Linux's -// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we -// only need 8 bits to store the NameLength. As a result, NameLength has been -// shortened and the other 8 bits are used to encode the file type. Use the -// FileTypeRaw field only if the SbDirentFileType feature is set. -// -// Note: This struct can be of variable size on disk. The one described below -// is of maximum size and the FileName beyond NameLength bytes might contain -// garbage. -type DirentNew struct { - InodeNumber uint32 - RecordLength uint16 - NameLength uint8 - FileTypeRaw uint8 - FileNameRaw [MaxFileName]byte -} - -// Compiles only if DirentNew implements Dirent. -var _ Dirent = (*DirentNew)(nil) - -// Inode implements Dirent.Inode. -func (d *DirentNew) Inode() uint32 { return d.InodeNumber } - -// RecordSize implements Dirent.RecordSize. -func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } - -// FileName implements Dirent.FileName. -func (d *DirentNew) FileName() string { - return string(d.FileNameRaw[:d.NameLength]) -} - -// FileType implements Dirent.FileType. -func (d *DirentNew) FileType() (fs.InodeType, bool) { - if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { - return inodeType, true - } - - panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) -} diff --git a/pkg/sentry/fs/ext4/disklayout/dirent_old.go b/pkg/sentry/fs/ext4/disklayout/dirent_old.go deleted file mode 100644 index 2e0f9c812..000000000 --- a/pkg/sentry/fs/ext4/disklayout/dirent_old.go +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import "gvisor.dev/gvisor/pkg/sentry/fs" - -// DirentOld represents the old directory entry struct which does not contain -// the file type. This emulates Linux's ext4_dir_entry struct. This is used in -// ext2, ext3 and sometimes in ext4. -// -// Note: This struct can be of variable size on disk. The one described below -// is of maximum size and the FileName beyond NameLength bytes might contain -// garbage. -type DirentOld struct { - InodeNumber uint32 - RecordLength uint16 - NameLength uint16 - FileNameRaw [MaxFileName]byte -} - -// Compiles only if DirentOld implements Dirent. -var _ Dirent = (*DirentOld)(nil) - -// Inode implements Dirent.Inode. -func (d *DirentOld) Inode() uint32 { return d.InodeNumber } - -// RecordSize implements Dirent.RecordSize. -func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } - -// FileName implements Dirent.FileName. -func (d *DirentOld) FileName() string { - return string(d.FileNameRaw[:d.NameLength]) -} - -// FileType implements Dirent.FileType. -func (d *DirentOld) FileType() (fs.InodeType, bool) { - return fs.Anonymous, false -} diff --git a/pkg/sentry/fs/ext4/disklayout/dirent_test.go b/pkg/sentry/fs/ext4/disklayout/dirent_test.go deleted file mode 100644 index cc6dff2c9..000000000 --- a/pkg/sentry/fs/ext4/disklayout/dirent_test.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "testing" -) - -// TestDirentSize tests that the dirent structs are of the correct -// size. -func TestDirentSize(t *testing.T) { - want := uintptr(263) - - assertSize(t, DirentOld{}, want) - assertSize(t, DirentNew{}, want) -} diff --git a/pkg/sentry/fs/ext4/disklayout/disklayout.go b/pkg/sentry/fs/ext4/disklayout/disklayout.go deleted file mode 100644 index bdf4e2132..000000000 --- a/pkg/sentry/fs/ext4/disklayout/disklayout.go +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package disklayout provides Linux ext file system's disk level structures -// which can be directly read into from the underlying device. Structs aim to -// emulate structures `exactly` how they are layed out on disk. -// -// This library aims to be compatible with all ext(2/3/4) systems so it -// provides a generic interface for all major structures and various -// implementations (for different versions). The user code is responsible for -// using appropriate implementations based on the underlying device. -// -// Interfacing all major structures here serves a few purposes: -// - Abstracts away the complexity of the underlying structure from client -// code. The client only has to figure out versioning on set up and then -// can use these as black boxes and pass it higher up the stack. -// - Having pointer receivers forces the user to use pointers to these -// heavy structs. Hence, prevents the client code from unintentionally -// copying these by value while passing the interface around. -// - Version-based implementation selection is resolved on set up hence -// avoiding per call overhead of choosing implementation. -// - All interface methods are pretty light weight (do not take in any -// parameters by design). Passing pointer arguments to interface methods -// can lead to heap allocation as the compiler won't be able to perform -// escape analysis on an unknown implementation at compile time. -// -// Notes: -// - All fields in these structs are exported because binary.Read would -// panic otherwise. -// - All structures on disk are in little-endian order. Only jbd2 (journal) -// structures are in big-endian order. -// - All OS dependent fields in these structures will be interpretted using -// the Linux version of that field. -// - The suffix `Lo` in field names stands for lower bits of that field. -// - The suffix `Hi` in field names stands for upper bits of that field. -// - The suffix `Raw` has been added to indicate that the field is not split -// into Lo and Hi fields and also to resolve name collision with the -// respective interface. -package disklayout diff --git a/pkg/sentry/fs/ext4/disklayout/inode.go b/pkg/sentry/fs/ext4/disklayout/inode.go deleted file mode 100644 index b48001910..000000000 --- a/pkg/sentry/fs/ext4/disklayout/inode.go +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -// The Inode interface must be implemented by structs representing ext inodes. -// The inode stores all the metadata pertaining to the file (except for the -// file name which is held by the directory entry). It does NOT expose all -// fields and should be extended if need be. -// -// Some file systems (e.g. FAT) use the directory entry to store all this -// information. Ext file systems do not so that they can support hard links. -// However, ext4 cheats a little bit and duplicates the file type in the -// directory entry for performance gains. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. -type Inode interface { - // Mode returns the linux file mode which is majorly used to extract - // information like: - // - File permissions (read/write/execute by user/group/others). - // - Sticky, set UID and GID bits. - // - File type. - // - // Masks to extract this information are provided in pkg/abi/linux/file.go. - Mode() linux.FileMode - - // UID returns the owner UID. - UID() auth.KUID - - // GID returns the owner GID. - GID() auth.KGID - - // Size returns the size of the file in bytes. - Size() uint64 - - // InodeSize returns the size of this inode struct in bytes. - // In ext2 and ext3, the inode struct and inode disk record size was fixed at - // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. - // However, accessing any field beyond the 128 bytes marker must be verified - // using this method. - InodeSize() uint16 - - // AccessTime returns the last access time. Shows when the file was last read. - // - // If InExtendedAttr is set, then this should NOT be used because the - // underlying field is used to store the extended attribute value checksum. - AccessTime() time.Time - - // ChangeTime returns the last change time. Shows when the file meta data - // (like permissions) was last changed. - // - // If InExtendedAttr is set, then this should NOT be used because the - // underlying field is used to store the lower 32 bits of the attribute - // value’s reference count. - ChangeTime() time.Time - - // ModificationTime returns the last modification time. Shows when the file - // content was last modified. - // - // If InExtendedAttr is set, then this should NOT be used because - // the underlying field contains the number of the inode that owns the - // extended attribute. - ModificationTime() time.Time - - // DeletionTime returns the deletion time. Inodes are marked as deleted by - // writing to the underlying field. FS tools can restore files until they are - // actually overwritten. - DeletionTime() time.Time - - // LinksCount returns the number of hard links to this inode. - // - // Normally there is an upper limit on the number of hard links: - // - ext2/ext3 = 32,000 - // - ext4 = 65,000 - // - // This implies that an ext4 directory cannot have more than 64,998 - // subdirectories because each subdirectory will have a hard link to the - // directory via the `..` entry. The directory has hard link via the `.` entry - // of its own. And finally the inode is initiated with 1 hard link (itself). - // - // The underlying value is reset to 1 if all the following hold: - // - Inode is a directory. - // - SbDirNlink is enabled. - // - Number of hard links is incremented past 64,999. - // Hard link value of 1 for a directory would indicate that the number of hard - // links is unknown because a directory can have minimum 2 hard links (itself - // and `.` entry). - LinksCount() uint16 - - // Flags returns InodeFlags which represents the inode flags. - Flags() InodeFlags - - // Blocks returns the underlying inode.i_block array. This field is special - // and is used to store various kinds of things depending on the filesystem - // version and inode type. - // - In ext2/ext3, it contains the block map. - // - In ext4, it contains the extent tree. - // - For inline files, it contains the file contents. - // - For symlinks, it contains the link path (if it fits here). - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. - Blocks() [60]byte -} - -// Inode flags. This is not comprehensive and flags which were not used in -// the Linux kernel have been excluded. -const ( - // InSync indicates that all writes to the file must be synchronous. - InSync = 0x8 - - // InImmutable indicates that this file is immutable. - InImmutable = 0x10 - - // InAppend indicates that this file can only be appended to. - InAppend = 0x20 - - // InNoDump indicates that teh dump(1) utility should not dump this file. - InNoDump = 0x40 - - // InNoAccessTime indicates that the access time of this inode must not be - // updated. - InNoAccessTime = 0x80 - - // InIndex indicates that this directory has hashed indexes. - InIndex = 0x1000 - - // InJournalData indicates that file data must always be written through a - // journal device. - InJournalData = 0x4000 - - // InDirSync indicates that all the directory entiry data must be written - // synchronously. - InDirSync = 0x10000 - - // InTopDir indicates that this inode is at the top of the directory hierarchy. - InTopDir = 0x20000 - - // InHugeFile indicates that this is a huge file. - InHugeFile = 0x40000 - - // InExtents indicates that this inode uses extents. - InExtents = 0x80000 - - // InExtendedAttr indicates that this inode stores a large extended attribute - // value in its data blocks. - InExtendedAttr = 0x200000 - - // InInline indicates that this inode has inline data. - InInline = 0x10000000 - - // InReserved indicates that this inode is reserved for the ext4 library. - InReserved = 0x80000000 -) - -// InodeFlags represents all possible combinations of inode flags. It aims to -// cover the bit masks and provide a more user-friendly interface. -type InodeFlags struct { - Sync bool - Immutable bool - Append bool - NoDump bool - NoAccessTime bool - Index bool - JournalData bool - DirSync bool - TopDir bool - HugeFile bool - Extents bool - ExtendedAttr bool - Inline bool - Reserved bool -} - -// ToInt converts inode flags back to its 32-bit rep. -func (f InodeFlags) ToInt() uint32 { - var res uint32 - - if f.Sync { - res |= InSync - } - if f.Immutable { - res |= InImmutable - } - if f.Append { - res |= InAppend - } - if f.NoDump { - res |= InNoDump - } - if f.NoAccessTime { - res |= InNoAccessTime - } - if f.Index { - res |= InIndex - } - if f.JournalData { - res |= InJournalData - } - if f.DirSync { - res |= InDirSync - } - if f.TopDir { - res |= InTopDir - } - if f.HugeFile { - res |= InHugeFile - } - if f.Extents { - res |= InExtents - } - if f.ExtendedAttr { - res |= InExtendedAttr - } - if f.Inline { - res |= InInline - } - if f.Reserved { - res |= InReserved - } - - return res -} - -// InodeFlagsFromInt converts the integer representation of inode flags to -// a InodeFlags struct. -func InodeFlagsFromInt(f uint32) InodeFlags { - return InodeFlags{ - Sync: f&InSync > 0, - Immutable: f&InImmutable > 0, - Append: f&InAppend > 0, - NoDump: f&InNoDump > 0, - NoAccessTime: f&InNoAccessTime > 0, - Index: f&InIndex > 0, - JournalData: f&InJournalData > 0, - DirSync: f&InDirSync > 0, - TopDir: f&InTopDir > 0, - HugeFile: f&InHugeFile > 0, - Extents: f&InExtents > 0, - ExtendedAttr: f&InExtendedAttr > 0, - Inline: f&InInline > 0, - Reserved: f&InReserved > 0, - } -} - -// These masks define how users can view/modify inode flags. The rest of the -// flags are for internal kernel usage only. -const ( - InUserReadFlagMask = 0x4BDFFF - InUserWriteFlagMask = 0x4B80FF -) diff --git a/pkg/sentry/fs/ext4/disklayout/inode_new.go b/pkg/sentry/fs/ext4/disklayout/inode_new.go deleted file mode 100644 index 4f5348372..000000000 --- a/pkg/sentry/fs/ext4/disklayout/inode_new.go +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import "gvisor.dev/gvisor/pkg/sentry/kernel/time" - -// InodeNew represents ext4 inode structure which can be bigger than -// OldInodeSize. The actual size of this struct should be determined using -// inode.ExtraInodeSize. Accessing any field here should be verified with the -// actual size. The extra space between the end of the inode struct and end of -// the inode record can be used to store extended attr. -// -// If the TimeExtra fields are in scope, the lower 2 bits of those are used -// to extend their counter part to be 34 bits wide; the rest (upper) 30 bits -// are used to provide nanoscond precision. Hence, these timestamps will now -// overflow in May 2446. -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. -type InodeNew struct { - InodeOld - - ExtraInodeSize uint16 - ChecksumHi uint16 - ChangeTimeExtra uint32 - ModificationTimeExtra uint32 - AccessTimeExtra uint32 - CreationTime uint32 - CreationTimeExtra uint32 - VersionHi uint32 - ProjectID uint32 -} - -// Compiles only if InodeNew implements Inode. -var _ Inode = (*InodeNew)(nil) - -// fromExtraTime decodes the extra time and constructs the kernel time struct -// with nanosecond precision. -func fromExtraTime(lo int32, extra uint32) time.Time { - // See description above InodeNew for format. - seconds := (int64(extra&0x3) << 32) + int64(lo) - nanoseconds := int64(extra >> 2) - return time.FromUnix(seconds, nanoseconds) -} - -// Only override methods which change due to ext4 specific fields. - -// Size implements Inode.Size. -func (in *InodeNew) Size() uint64 { - return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) -} - -// InodeSize implements Inode.InodeSize. -func (in *InodeNew) InodeSize() uint16 { - return oldInodeSize + in.ExtraInodeSize -} - -// ChangeTime implements Inode.ChangeTime. -func (in *InodeNew) ChangeTime() time.Time { - // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. - if in.ExtraInodeSize >= 8 { - return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) - } - - return in.InodeOld.ChangeTime() -} - -// ModificationTime implements Inode.ModificationTime. -func (in *InodeNew) ModificationTime() time.Time { - // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. - if in.ExtraInodeSize >= 12 { - return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) - } - - return in.InodeOld.ModificationTime() -} - -// AccessTime implements Inode.AccessTime. -func (in *InodeNew) AccessTime() time.Time { - // Apply new timestamp logic if inode.AccessTimeExtra is in scope. - if in.ExtraInodeSize >= 16 { - return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) - } - - return in.InodeOld.AccessTime() -} diff --git a/pkg/sentry/fs/ext4/disklayout/inode_old.go b/pkg/sentry/fs/ext4/disklayout/inode_old.go deleted file mode 100644 index dc4c9d8e4..000000000 --- a/pkg/sentry/fs/ext4/disklayout/inode_old.go +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -const ( - // oldInodeSize is the inode size in ext2/ext3. - oldInodeSize = 128 -) - -// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. -// Inode struct size and record size are both 128 bytes for this. -// -// All fields representing time are in seconds since the epoch. Which means that -// they will overflow in January 2038. -type InodeOld struct { - ModeRaw uint16 - UIDLo uint16 - SizeLo uint32 - - // The time fields are signed integers because they could be negative to - // represent time before the epoch. - AccessTimeRaw int32 - ChangeTimeRaw int32 - ModificationTimeRaw int32 - DeletionTimeRaw int32 - - GIDLo uint16 - LinksCountRaw uint16 - BlocksCountLo uint32 - FlagsRaw uint32 - VersionLo uint32 // This is OS dependent. - BlocksRaw [60]byte - Generation uint32 - FileACLLo uint32 - SizeHi uint32 - ObsoFaddr uint32 - - // OS dependent fields have been inlined here. - BlocksCountHi uint16 - FileACLHi uint16 - UIDHi uint16 - GIDHi uint16 - ChecksumLo uint16 - _ uint16 -} - -// Compiles only if InodeOld implements Inode. -var _ Inode = (*InodeOld)(nil) - -// Mode implements Inode.Mode. -func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } - -// UID implements Inode.UID. -func (in *InodeOld) UID() auth.KUID { - return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) -} - -// GID implements Inode.GID. -func (in *InodeOld) GID() auth.KGID { - return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) -} - -// Size implements Inode.Size. -func (in *InodeOld) Size() uint64 { - // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. - return uint64(in.SizeLo) -} - -// InodeSize implements Inode.InodeSize. -func (in *InodeOld) InodeSize() uint16 { return oldInodeSize } - -// AccessTime implements Inode.AccessTime. -func (in *InodeOld) AccessTime() time.Time { - return time.FromUnix(int64(in.AccessTimeRaw), 0) -} - -// ChangeTime implements Inode.ChangeTime. -func (in *InodeOld) ChangeTime() time.Time { - return time.FromUnix(int64(in.ChangeTimeRaw), 0) -} - -// ModificationTime implements Inode.ModificationTime. -func (in *InodeOld) ModificationTime() time.Time { - return time.FromUnix(int64(in.ModificationTimeRaw), 0) -} - -// DeletionTime implements Inode.DeletionTime. -func (in *InodeOld) DeletionTime() time.Time { - return time.FromUnix(int64(in.DeletionTimeRaw), 0) -} - -// LinksCount implements Inode.LinksCount. -func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } - -// Flags implements Inode.Flags. -func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } - -// Blocks implements Inode.Blocks. -func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw } diff --git a/pkg/sentry/fs/ext4/disklayout/inode_test.go b/pkg/sentry/fs/ext4/disklayout/inode_test.go deleted file mode 100644 index 9cae9e4f0..000000000 --- a/pkg/sentry/fs/ext4/disklayout/inode_test.go +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "fmt" - "strconv" - "testing" - - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -// TestInodeSize tests that the inode structs are of the correct size. -func TestInodeSize(t *testing.T) { - assertSize(t, InodeOld{}, oldInodeSize) - - // This was updated from 156 bytes to 160 bytes in Oct 2015. - assertSize(t, InodeNew{}, 160) -} - -// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in -// ext4 inode structs are decoded correctly. -// -// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. -func TestTimestampSeconds(t *testing.T) { - type timestampTest struct { - // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. - // If this is set then the 32-bit time is negative. - msbSet bool - - // lowerBound tells if we should take the lowest possible value of - // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to - // false it tells to take the highest possible value. - lowerBound bool - - // extraBits is InodeNew.[X]TimeExtra. - extraBits uint32 - - // want is the kernel time struct that is expected. - want time.Time - } - - tests := []timestampTest{ - // 1901-12-13 - { - msbSet: true, - lowerBound: true, - extraBits: 0, - want: time.FromUnix(int64(-0x80000000), 0), - }, - - // 1969-12-31 - { - msbSet: true, - lowerBound: false, - extraBits: 0, - want: time.FromUnix(int64(-1), 0), - }, - - // 1970-01-01 - { - msbSet: false, - lowerBound: true, - extraBits: 0, - want: time.FromUnix(int64(0), 0), - }, - - // 2038-01-19 - { - msbSet: false, - lowerBound: false, - extraBits: 0, - want: time.FromUnix(int64(0x7fffffff), 0), - }, - - // 2038-01-19 - { - msbSet: true, - lowerBound: true, - extraBits: 1, - want: time.FromUnix(int64(0x80000000), 0), - }, - - // 2106-02-07 - { - msbSet: true, - lowerBound: false, - extraBits: 1, - want: time.FromUnix(int64(0xffffffff), 0), - }, - - // 2106-02-07 - { - msbSet: false, - lowerBound: true, - extraBits: 1, - want: time.FromUnix(int64(0x100000000), 0), - }, - - // 2174-02-25 - { - msbSet: false, - lowerBound: false, - extraBits: 1, - want: time.FromUnix(int64(0x17fffffff), 0), - }, - - // 2174-02-25 - { - msbSet: true, - lowerBound: true, - extraBits: 2, - want: time.FromUnix(int64(0x180000000), 0), - }, - - // 2242-03-16 - { - msbSet: true, - lowerBound: false, - extraBits: 2, - want: time.FromUnix(int64(0x1ffffffff), 0), - }, - - // 2242-03-16 - { - msbSet: false, - lowerBound: true, - extraBits: 2, - want: time.FromUnix(int64(0x200000000), 0), - }, - - // 2310-04-04 - { - msbSet: false, - lowerBound: false, - extraBits: 2, - want: time.FromUnix(int64(0x27fffffff), 0), - }, - - // 2310-04-04 - { - msbSet: true, - lowerBound: true, - extraBits: 3, - want: time.FromUnix(int64(0x280000000), 0), - }, - - // 2378-04-22 - { - msbSet: true, - lowerBound: false, - extraBits: 3, - want: time.FromUnix(int64(0x2ffffffff), 0), - }, - - // 2378-04-22 - { - msbSet: false, - lowerBound: true, - extraBits: 3, - want: time.FromUnix(int64(0x300000000), 0), - }, - - // 2446-05-10 - { - msbSet: false, - lowerBound: false, - extraBits: 3, - want: time.FromUnix(int64(0x37fffffff), 0), - }, - } - - lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 - upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 - lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 - upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 - - get32BitTime := func(test timestampTest) int32 { - if test.msbSet { - if test.lowerBound { - return lowerMSB1 - } - - return upperMSB1 - } - - if test.lowerBound { - return lowerMSB0 - } - - return upperMSB0 - } - - getTestName := func(test timestampTest) string { - return fmt.Sprintf( - "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", - strconv.FormatInt(int64(test.extraBits), 2), - test.msbSet, - test.lowerBound, - ) - } - - for _, test := range tests { - t.Run(getTestName(test), func(t *testing.T) { - if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { - t.Errorf("Expected: %v, Got: %v", test.want, got) - } - }) - } -} diff --git a/pkg/sentry/fs/ext4/disklayout/superblock.go b/pkg/sentry/fs/ext4/disklayout/superblock.go deleted file mode 100644 index e4b8f46fb..000000000 --- a/pkg/sentry/fs/ext4/disklayout/superblock.go +++ /dev/null @@ -1,468 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlock should be implemented by structs representing the ext superblock. -// The superblock holds a lot of information about the enclosing filesystem. -// This interface aims to provide access methods to important information held -// by the superblock. It does NOT expose all fields of the superblock, only the -// ones necessary. This can be expanded when need be. -// -// Location and replication: -// - The superblock is located at offset 1024 in block group 0. -// - Redundant copies of the superblock and group descriptors are kept in -// all groups if SbSparse feature flag is NOT set. If it is set, the -// replicas only exist in groups whose group number is either 0 or a -// power of 3, 5, or 7. -// - There is also a sparse superblock feature v2 in which there are just -// two replicas saved in the block groups pointed by sb.s_backup_bgs. -// -// Replicas should eventually be updated if the superblock is updated. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. -type SuperBlock interface { - // InodesCount returns the total number of inodes in this filesystem. - InodesCount() uint32 - - // BlocksCount returns the total number of data blocks in this filesystem. - BlocksCount() uint64 - - // FreeBlocksCount returns the number of free blocks in this filesystem. - FreeBlocksCount() uint64 - - // FreeInodesCount returns the number of free inodes in this filesystem. - FreeInodesCount() uint32 - - // MountCount returns the number of mounts since the last fsck. - MountCount() uint16 - - // MaxMountCount returns the number of mounts allowed beyond which a fsck is - // needed. - MaxMountCount() uint16 - - // FirstDataBlock returns the absolute block number of the first data block, - // which contains the super block itself. - // - // If the filesystem has 1kb data blocks then this should return 1. For all - // other configurations, this typically returns 0. - // - // The first block group descriptor is in (FirstDataBlock() + 1)th block. - FirstDataBlock() uint32 - - // BlockSize returns the size of one data block in this filesystem. - // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that - // the smallest block size is 1kb. - BlockSize() uint64 - - // BlocksPerGroup returns the number of data blocks in a block group. - BlocksPerGroup() uint32 - - // ClusterSize returns block cluster size (set during mkfs time by admin). - // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that - // the smallest cluster size is 1kb. - // - // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature - // is NOT set and consequently BlockSize() = ClusterSize() in that case. - ClusterSize() uint64 - - // ClustersPerGroup returns: - // - number of clusters per group if bigalloc is enabled. - // - BlocksPerGroup() otherwise. - ClustersPerGroup() uint32 - - // InodeSize returns the size of the inode disk record size in bytes. Use this - // to iterate over inode arrays on disk. - // - // In ext2 and ext3: - // - Each inode had a disk record of 128 bytes. - // - The inode struct size was fixed at 128 bytes. - // - // In ext4 its possible to allocate larger on-disk inodes: - // - Inode disk record size = sb.s_inode_size (function return value). - // = 256 (default) - // - Inode struct size = 128 + inode.i_extra_isize. - // = 128 + 32 = 160 (default) - InodeSize() uint16 - - // InodesPerGroup returns the number of inodes in a block group. - InodesPerGroup() uint32 - - // BgDescSize returns the size of the block group descriptor struct. - // - // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor - // is only 32 bytes long. - // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST - // 64 bytes. It might be bigger than that. - BgDescSize() uint16 - - // CompatibleFeatures returns the CompatFeatures struct which holds all the - // compatible features this fs supports. - CompatibleFeatures() CompatFeatures - - // IncompatibleFeatures returns the CompatFeatures struct which holds all the - // incompatible features this fs supports. - IncompatibleFeatures() IncompatFeatures - - // ReadOnlyCompatibleFeatures returns the CompatFeatures struct which holds all the - // readonly compatible features this fs supports. - ReadOnlyCompatibleFeatures() RoCompatFeatures - - // Magic() returns the magic signature which must be 0xef53. - Magic() uint16 - - // Revision returns the superblock revision. Superblock struct fields from - // offset 0x54 till 0x150 should only be used if superblock has DynamicRev. - Revision() SbRevision -} - -// SbRevision is the type for superblock revisions. -type SbRevision int - -// Super block revisions. -const ( - // OldRev is the good old (original) format. - OldRev SbRevision = 0 - - // DynamicRev is v2 format w/ dynamic inode sizes. - DynamicRev SbRevision = 1 -) - -// Superblock compatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbDirPrealloc indicates directory preallocation. - SbDirPrealloc = 0x1 - - // SbHasJournal indicates the presence of a journal. jbd2 should only work - // with this being set. - SbHasJournal = 0x4 - - // SbExtAttr indicates extended attributes support. - SbExtAttr = 0x8 - - // SbResizeInode indicates that the fs has reserved GDT blocks (right after - // group descriptors) for fs expansion. - SbResizeInode = 0x10 - - // SbDirIndex indicates that the fs has directory indices. - SbDirIndex = 0x20 - - // SbSparseV2 stands for Sparse superblock version 2. - SbSparseV2 = 0x200 -) - -// CompatFeatures represents a superblock's compatible feature set. If the -// kernel does not understand any of these feature, it can still read/write -// to this fs. -type CompatFeatures struct { - DirPrealloc bool - HasJournal bool - ExtAttr bool - ResizeInode bool - DirIndex bool - SparseV2 bool -} - -// ToInt converts superblock compatible features back to its 32-bit rep. -func (f CompatFeatures) ToInt() uint32 { - var res uint32 - - if f.DirPrealloc { - res |= SbDirPrealloc - } - if f.HasJournal { - res |= SbHasJournal - } - if f.ExtAttr { - res |= SbExtAttr - } - if f.ResizeInode { - res |= SbResizeInode - } - if f.DirIndex { - res |= SbDirIndex - } - if f.SparseV2 { - res |= SbSparseV2 - } - - return res -} - -// CompatFeaturesFromInt converts the integer representation of superblock -// compatible features to CompatFeatures struct. -func CompatFeaturesFromInt(f uint32) CompatFeatures { - return CompatFeatures{ - DirPrealloc: f&SbDirPrealloc > 0, - HasJournal: f&SbHasJournal > 0, - ExtAttr: f&SbExtAttr > 0, - ResizeInode: f&SbResizeInode > 0, - DirIndex: f&SbDirIndex > 0, - SparseV2: f&SbSparseV2 > 0, - } -} - -// Superblock incompatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbDirentFileType indicates that directory entries record the file type. - // We should use struct ext4_dir_entry_2 for dirents then. - SbDirentFileType = 0x2 - - // SbRecovery indicates that the filesystem needs recovery. - SbRecovery = 0x4 - - // SbJournalDev indicates that the filesystem has a separate journal device. - SbJournalDev = 0x8 - - // SbMetaBG indicates that the filesystem is using Meta block groups. Moves - // the group descriptors from the congested first block group into the first - // group of each metablock group to increase the maximum block groups limit - // and hence support much larger filesystems. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups. - SbMetaBG = 0x10 - - // SbExtents indicates that the filesystem uses extents. Must be set in ext4 - // filesystems. - SbExtents = 0x40 - - // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits. - // Hence can support 2^64 data blocks. - SbIs64Bit = 0x80 - - // SbMMP indicates that this filesystem has multiple mount protection. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection. - SbMMP = 0x100 - - // SbFlexBg indicates that this filesystem has flexible block groups. Several - // block groups are tied into one logical block group so that all the metadata - // for the block groups (bitmaps and inode tables) are close together for - // faster loading. Consequently, large files will be continuous on disk. - // However, this does not affect the placement of redundant superblocks and - // group descriptors. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups. - SbFlexBg = 0x200 - - // SbLargeDir shows that large directory enabled. Directory htree can be 3 - // levels deep. Directory htrees are allowed to be 2 levels deep otherwise. - SbLargeDir = 0x4000 - - // SbInlineData allows inline data in inodes for really small files. - SbInlineData = 0x8000 - - // SbEncrypted indicates that this fs contains encrypted inodes. - SbEncrypted = 0x10000 -) - -// IncompatFeatures represents a superblock's incompatible feature set. If the -// kernel does not understand any of these feature, it should refuse to mount. -type IncompatFeatures struct { - DirentFileType bool - Recovery bool - JournalDev bool - MetaBG bool - Extents bool - Is64Bit bool - MMP bool - FlexBg bool - LargeDir bool - InlineData bool - Encrypted bool -} - -// ToInt converts superblock incompatible features back to its 32-bit rep. -func (f IncompatFeatures) ToInt() uint32 { - var res uint32 - - if f.DirentFileType { - res |= SbDirentFileType - } - if f.Recovery { - res |= SbRecovery - } - if f.JournalDev { - res |= SbJournalDev - } - if f.MetaBG { - res |= SbMetaBG - } - if f.Extents { - res |= SbExtents - } - if f.Is64Bit { - res |= SbIs64Bit - } - if f.MMP { - res |= SbMMP - } - if f.FlexBg { - res |= SbFlexBg - } - if f.LargeDir { - res |= SbLargeDir - } - if f.InlineData { - res |= SbInlineData - } - if f.Encrypted { - res |= SbEncrypted - } - - return res -} - -// IncompatFeaturesFromInt converts the integer representation of superblock -// incompatible features to IncompatFeatures struct. -func IncompatFeaturesFromInt(f uint32) IncompatFeatures { - return IncompatFeatures{ - DirentFileType: f&SbDirentFileType > 0, - Recovery: f&SbRecovery > 0, - JournalDev: f&SbJournalDev > 0, - MetaBG: f&SbMetaBG > 0, - Extents: f&SbExtents > 0, - Is64Bit: f&SbIs64Bit > 0, - MMP: f&SbMMP > 0, - FlexBg: f&SbFlexBg > 0, - LargeDir: f&SbLargeDir > 0, - InlineData: f&SbInlineData > 0, - Encrypted: f&SbEncrypted > 0, - } -} - -// Superblock readonly compatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbSparse indicates sparse superblocks. Only groups with number either 0 or - // a power of 3, 5, or 7 will have redundant copies of the superblock and - // block descriptors. - SbSparse = 0x1 - - // SbLargeFile indicates that this fs has been used to store a file >= 2GiB. - SbLargeFile = 0x2 - - // SbHugeFile indicates that this fs contains files whose sizes are - // represented in units of logicals blocks, not 512-byte sectors. - SbHugeFile = 0x8 - - // SbGdtCsum indicates that group descriptors have checksums. - SbGdtCsum = 0x10 - - // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a - // 32,000 subdirectory limit. - SbDirNlink = 0x20 - - // SbExtraIsize indicates that large inodes exist on this filesystem. - SbExtraIsize = 0x40 - - // SbHasSnapshot indicates the existence of a snapshot. - SbHasSnapshot = 0x80 - - // SbQuota enables usage tracking for all quota types. - SbQuota = 0x100 - - // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation - // unit becomes a cluster rather than a data block. Then block bitmaps track - // clusters, not data blocks. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc. - SbBigalloc = 0x200 - - // SbMetadataCsum indicates that the fs supports metadata checksumming. - SbMetadataCsum = 0x400 - - // SbReadOnly marks this filesystem as readonly. Should refuse to mount in - // read/write mode. - SbReadOnly = 0x1000 -) - -// RoCompatFeatures represents a superblock's readonly compatible feature set. -// If the kernel does not understand any of these feature, it can still mount -// readonly. But if the user wants to mount read/write, the kernel should -// refuse to mount. -type RoCompatFeatures struct { - Sparse bool - LargeFile bool - HugeFile bool - GdtCsum bool - DirNlink bool - ExtraIsize bool - HasSnapshot bool - Quota bool - Bigalloc bool - MetadataCsum bool - ReadOnly bool -} - -// ToInt converts superblock readonly compatible features to its 32-bit rep. -func (f RoCompatFeatures) ToInt() uint32 { - var res uint32 - - if f.Sparse { - res |= SbSparse - } - if f.LargeFile { - res |= SbLargeFile - } - if f.HugeFile { - res |= SbHugeFile - } - if f.GdtCsum { - res |= SbGdtCsum - } - if f.DirNlink { - res |= SbDirNlink - } - if f.ExtraIsize { - res |= SbExtraIsize - } - if f.HasSnapshot { - res |= SbHasSnapshot - } - if f.Quota { - res |= SbQuota - } - if f.Bigalloc { - res |= SbBigalloc - } - if f.MetadataCsum { - res |= SbMetadataCsum - } - if f.ReadOnly { - res |= SbReadOnly - } - - return res -} - -// RoCompatFeaturesFromInt converts the integer representation of superblock -// readonly compatible features to RoCompatFeatures struct. -func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures { - return RoCompatFeatures{ - Sparse: f&SbSparse > 0, - LargeFile: f&SbLargeFile > 0, - HugeFile: f&SbHugeFile > 0, - GdtCsum: f&SbGdtCsum > 0, - DirNlink: f&SbDirNlink > 0, - ExtraIsize: f&SbExtraIsize > 0, - HasSnapshot: f&SbHasSnapshot > 0, - Quota: f&SbQuota > 0, - Bigalloc: f&SbBigalloc > 0, - MetadataCsum: f&SbMetadataCsum > 0, - ReadOnly: f&SbReadOnly > 0, - } -} diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_32.go b/pkg/sentry/fs/ext4/disklayout/superblock_32.go deleted file mode 100644 index 587e4afaa..000000000 --- a/pkg/sentry/fs/ext4/disklayout/superblock_32.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of -// the ext4_super_block struct in fs/ext4/ext4.h. -type SuperBlock32Bit struct { - // We embed the old superblock struct here because the 32-bit version is just - // an extension of the old version. - SuperBlockOld - - FirstInode uint32 - InodeSizeRaw uint16 - BlockGroupNumber uint16 - FeatureCompat uint32 - FeatureIncompat uint32 - FeatureRoCompat uint32 - UUID [16]byte - VolumeName [16]byte - LastMounted [64]byte - AlgoUsageBitmap uint32 - PreallocBlocks uint8 - PreallocDirBlocks uint8 - ReservedGdtBlocks uint16 - JournalUUID [16]byte - JournalInum uint32 - JournalDev uint32 - LastOrphan uint32 - HashSeed [4]uint32 - DefaultHashVersion uint8 - JnlBackupType uint8 - BgDescSizeRaw uint16 - DefaultMountOpts uint32 - FirstMetaBg uint32 - MkfsTime uint32 - JnlBlocks [17]uint32 -} - -// Compiles only if SuperBlock32Bit implements SuperBlock. -var _ SuperBlock = (*SuperBlock32Bit)(nil) - -// Only override methods which change based on the additional fields above. -// Not overriding SuperBlock.BgDescSize because it would still return 32 here. - -// InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlock32Bit) InodeSize() uint16 { - return sb.InodeSizeRaw -} - -// CompatibleFeatures implements SuperBlock.CompatibleFeatures. -func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures { - return CompatFeaturesFromInt(sb.FeatureCompat) -} - -// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. -func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures { - return IncompatFeaturesFromInt(sb.FeatureIncompat) -} - -// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. -func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures { - return RoCompatFeaturesFromInt(sb.FeatureRoCompat) -} diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_64.go b/pkg/sentry/fs/ext4/disklayout/superblock_64.go deleted file mode 100644 index a2c2278fb..000000000 --- a/pkg/sentry/fs/ext4/disklayout/superblock_64.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of -// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly -// 1024 bytes (smallest possible block size) and hence the superblock always -// fits in no more than one data block. -type SuperBlock64Bit struct { - // We embed the 32-bit struct here because 64-bit version is just an extension - // of the 32-bit version. - SuperBlock32Bit - - BlocksCountHi uint32 - ReservedBlocksCountHi uint32 - FreeBlocksCountHi uint32 - MinInodeSize uint16 - WantInodeSize uint16 - Flags uint32 - RaidStride uint16 - MmpInterval uint16 - MmpBlock uint64 - RaidStripeWidth uint32 - LogGroupsPerFlex uint8 - ChecksumType uint8 - _ uint16 - KbytesWritten uint64 - SnapshotInum uint32 - SnapshotID uint32 - SnapshotRsrvBlocksCount uint64 - SnapshotList uint32 - ErrorCount uint32 - FirstErrorTime uint32 - FirstErrorInode uint32 - FirstErrorBlock uint64 - FirstErrorFunction [32]byte - FirstErrorLine uint32 - LastErrorTime uint32 - LastErrorInode uint32 - LastErrorLine uint32 - LastErrorBlock uint64 - LastErrorFunction [32]byte - MountOpts [64]byte - UserQuotaInum uint32 - GroupQuotaInum uint32 - OverheadBlocks uint32 - BackupBgs [2]uint32 - EncryptAlgos [4]uint8 - EncryptPwSalt [16]uint8 - LostFoundInode uint32 - ProjectQuotaInode uint32 - ChecksumSeed uint32 - WtimeHi uint8 - MtimeHi uint8 - MkfsTimeHi uint8 - LastCheckHi uint8 - FirstErrorTimeHi uint8 - LastErrorTimeHi uint8 - _ [2]uint8 - Encoding uint16 - EncodingFlags uint16 - _ [95]uint32 - Checksum uint32 -} - -// Compiles only if SuperBlock64Bit implements SuperBlock. -var _ SuperBlock = (*SuperBlock64Bit)(nil) - -// Only override methods which change based on the 64-bit feature. - -// BlocksCount implements SuperBlock.BlocksCount. -func (sb *SuperBlock64Bit) BlocksCount() uint64 { - return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo) -} - -// FreeBlocksCount implements SuperBlock.FreeBlocksCount. -func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 { - return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo) -} - -// BgDescSize implements SuperBlock.BgDescSize. -func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw } diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_old.go b/pkg/sentry/fs/ext4/disklayout/superblock_old.go deleted file mode 100644 index c74953610..000000000 --- a/pkg/sentry/fs/ext4/disklayout/superblock_old.go +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlockOld implements SuperBlock and represents the old version of the -// superblock struct in ext2 and ext3 systems. -type SuperBlockOld struct { - InodesCountRaw uint32 - BlocksCountLo uint32 - ReservedBlocksCount uint32 - FreeBlocksCountLo uint32 - FreeInodesCountRaw uint32 - FirstDataBlockRaw uint32 - LogBlockSize uint32 - LogClusterSize uint32 - BlocksPerGroupRaw uint32 - ClustersPerGroupRaw uint32 - InodesPerGroupRaw uint32 - Mtime uint32 - Wtime uint32 - MountCountRaw uint16 - MaxMountCountRaw uint16 - MagicRaw uint16 - State uint16 - Errors uint16 - MinorRevLevel uint16 - LastCheck uint32 - CheckInterval uint32 - CreatorOS uint32 - RevLevel uint32 - DefResUID uint16 - DefResGID uint16 -} - -// InodesCount implements SuperBlock.InodesCount. -func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } - -// BlocksCount implements SuperBlock.BlocksCount. -func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) } - -// FreeBlocksCount implements SuperBlock.FreeBlocksCount. -func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) } - -// FreeInodesCount implements SuperBlock.FreeInodesCount. -func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw } - -// MountCount implements SuperBlock.MountCount. -func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw } - -// MaxMountCount implements SuperBlock.MaxMountCount. -func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw } - -// FirstDataBlock implements SuperBlock.FirstDataBlock. -func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw } - -// BlockSize implements SuperBlock.BlockSize. -func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) } - -// BlocksPerGroup implements SuperBlock.BlocksPerGroup. -func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw } - -// ClusterSize implements SuperBlock.ClusterSize. -func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) } - -// ClustersPerGroup implements SuperBlock.ClustersPerGroup. -func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } - -// InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlockOld) InodeSize() uint16 { return oldInodeSize } - -// InodesPerGroup implements SuperBlock.InodesPerGroup. -func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } - -// BgDescSize implements SuperBlock.BgDescSize. -func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 } - -// CompatibleFeatures implements SuperBlock.CompatibleFeatures. -func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} } - -// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. -func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} } - -// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. -func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} } - -// Magic implements SuperBlock.Magic. -func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw } - -// Revision implements SuperBlock.Revision. -func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) } diff --git a/pkg/sentry/fs/ext4/disklayout/superblock_test.go b/pkg/sentry/fs/ext4/disklayout/superblock_test.go deleted file mode 100644 index 463b5ba21..000000000 --- a/pkg/sentry/fs/ext4/disklayout/superblock_test.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "testing" -) - -// TestSuperBlockSize tests that the superblock structs are of the correct -// size. -func TestSuperBlockSize(t *testing.T) { - assertSize(t, SuperBlockOld{}, 84) - assertSize(t, SuperBlock32Bit{}, 336) - assertSize(t, SuperBlock64Bit{}, 1024) -} diff --git a/pkg/sentry/fs/ext4/disklayout/test_utils.go b/pkg/sentry/fs/ext4/disklayout/test_utils.go deleted file mode 100644 index 9c63f04c0..000000000 --- a/pkg/sentry/fs/ext4/disklayout/test_utils.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "reflect" - "testing" - - "gvisor.dev/gvisor/pkg/binary" -) - -func assertSize(t *testing.T, v interface{}, want uintptr) { - t.Helper() - - if got := binary.Size(v); got != want { - t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) - } -} diff --git a/pkg/sentry/fs/ext4/fs.go b/pkg/sentry/fs/ext4/fs.go deleted file mode 100644 index 5c7274821..000000000 --- a/pkg/sentry/fs/ext4/fs.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ext4 implements the ext4 filesystem. -package ext4 - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// filesystem implements fs.Filesystem for ext4. -// -// +stateify savable -type filesystem struct{} - -func init() { - fs.RegisterFilesystem(&filesystem{}) -} - -// FilesystemName is the name under which the filesystem is registered. -// Name matches fs/ext4/super.c:ext4_fs_type.name. -const FilesystemName = "ext4" - -// Name is the name of the file system. -func (*filesystem) Name() string { - return FilesystemName -} - -// AllowUserMount prohibits users from using mount(2) with this file system. -func (*filesystem) AllowUserMount() bool { - return false -} - -// AllowUserList prohibits this filesystem to be listed in /proc/filesystems. -func (*filesystem) AllowUserList() bool { - return false -} - -// Flags returns properties of the filesystem. -// -// In Linux, ext4 returns FS_REQUIRES_DEV. See fs/ext4/super.c -func (*filesystem) Flags() fs.FilesystemFlags { - return fs.FilesystemRequiresDev -} - -// Mount returns the root inode of the ext4 fs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, cgroupsInt interface{}) (*fs.Inode, error) { - panic("unimplemented") -} -- cgit v1.2.3 From 07bb86080fd37f108d9bfa9bf32534bd25deb186 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 11 Jul 2019 16:19:37 -0700 Subject: Bump rules_go to v0.18.7 and go toolchain to v1.12.7. PiperOrigin-RevId: 257703164 --- WORKSPACE | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 26243b7f2..b57320e7a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -3,10 +3,10 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( name = "io_bazel_rules_go", - sha256 = "f04d2373bcaf8aa09bccb08a98a57e721306c8f6043a2a0ee610fd6853dcde3d", + sha256 = "45409e6c4f748baa9e05f8f6ab6efaa05739aa064e3ab94e5a1a09849c51806a", urls = [ - "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/0.18.6/rules_go-0.18.6.tar.gz", - "https://github.com/bazelbuild/rules_go/releases/download/0.18.6/rules_go-0.18.6.tar.gz", + "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/0.18.7/rules_go-0.18.7.tar.gz", + "https://github.com/bazelbuild/rules_go/releases/download/0.18.7/rules_go-0.18.7.tar.gz", ], ) @@ -21,7 +21,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_to go_rules_dependencies() go_register_toolchains( - go_version = "1.12.6", + go_version = "1.12.7", nogo = "@//:nogo", ) -- cgit v1.2.3 From 2eeca68900eeb57a679eccc08fe172b1d9ddf97f Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Thu, 11 Jul 2019 17:16:27 -0700 Subject: Added tiny ext4 image. The image is of size 64Kb which supports 64 1k blocks and 16 inodes. This is the smallest size mkfs.ext4 works with. Added README.md documenting how this was created and included all files on the device under assets. PiperOrigin-RevId: 257712672 --- pkg/sentry/fs/ext/assets/README.md | 28 ++++++++++++++++++++++++ pkg/sentry/fs/ext/assets/bigfile.txt | 41 +++++++++++++++++++++++++++++++++++ pkg/sentry/fs/ext/assets/file.txt | 1 + pkg/sentry/fs/ext/assets/symlink.txt | 1 + pkg/sentry/fs/ext/assets/tiny.ext4 | Bin 0 -> 65536 bytes 5 files changed, 71 insertions(+) create mode 100644 pkg/sentry/fs/ext/assets/README.md create mode 100644 pkg/sentry/fs/ext/assets/bigfile.txt create mode 100644 pkg/sentry/fs/ext/assets/file.txt create mode 120000 pkg/sentry/fs/ext/assets/symlink.txt create mode 100644 pkg/sentry/fs/ext/assets/tiny.ext4 diff --git a/pkg/sentry/fs/ext/assets/README.md b/pkg/sentry/fs/ext/assets/README.md new file mode 100644 index 000000000..b6eac4f40 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/README.md @@ -0,0 +1,28 @@ +### Tiny Ext4 Image + +The image is of size 64Kb which supports 64 1k blocks and 16 inodes. This is the +smallest size mkfs.ext4 works with. + +This image was generated using the following commands. + +```bash +fallocate -l 64K tiny.ext4 +mkfs.ext4 -j tiny.ext4 +``` + +You can mount it using: + +```bash +sudo mount -o loop tiny.ext4 $MOUNTPOINT +``` + +`file.txt`, `bigfile.txt` and `symlink.txt` were added to this image by just +mounting it and copying (while preserving links) those files to the mountpoint +directory using: + +```bash +sudo cp -P {file.txt,symlink.txt,bigfile.txt} $MOUNTPOINT/ +``` + +The files in this directory mirror the contents and organisation of the files +stored in the image. diff --git a/pkg/sentry/fs/ext/assets/bigfile.txt b/pkg/sentry/fs/ext/assets/bigfile.txt new file mode 100644 index 000000000..3857cf516 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/bigfile.txt @@ -0,0 +1,41 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus faucibus eleifend orci, ut ornare nibh faucibus eu. Cras at condimentum massa. Nullam luctus, elit non porttitor congue, sapien diam feugiat sapien, sed eleifend nulla mauris non arcu. Sed lacinia mauris magna, eu mollis libero varius sit amet. Donec mollis, quam convallis commodo posuere, dolor nisi placerat nisi, in faucibus augue mi eu lorem. In pharetra consectetur faucibus. Ut euismod ex efficitur egestas tincidunt. Maecenas condimentum ut ante in rutrum. Vivamus sed arcu tempor, faucibus turpis et, lacinia diam. + +Sed in lacus vel nisl interdum bibendum in sed justo. Nunc tellus risus, molestie vitae arcu sed, molestie tempus ligula. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc risus neque, volutpat et ante non, ullamcorper condimentum ante. Aliquam sed metus in urna condimentum convallis. Vivamus ut libero mauris. Proin mollis posuere consequat. Vestibulum placerat mollis est et pulvinar. + +Donec rutrum odio ac diam pharetra, id fermentum magna cursus. Pellentesque in dapibus elit, et condimentum orci. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Suspendisse euismod dapibus est, id vestibulum mauris. Nulla facilisi. Nulla cursus gravida nisi. Phasellus vestibulum rutrum lectus, a dignissim mauris hendrerit vitae. In at elementum mauris. Integer vel efficitur velit. Nullam fringilla sapien mi, quis luctus neque efficitur ac. Aenean nec quam dapibus nunc commodo pharetra. Proin sapien mi, fermentum aliquet vulputate non, aliquet porttitor diam. Quisque lacinia, urna et finibus fermentum, nunc lacus vehicula ex, sed congue metus lectus ac quam. Aliquam erat volutpat. Suspendisse sodales, dolor ut tincidunt finibus, augue erat varius tellus, a interdum erat sem at nunc. Vestibulum cursus iaculis sapien, vitae feugiat dui auctor quis. + +Pellentesque nec maximus nulla, eu blandit diam. Maecenas quis arcu ornare, congue ante at, vehicula ipsum. Praesent feugiat mauris rutrum sem fermentum, nec luctus ipsum placerat. Pellentesque placerat ipsum at dignissim fringilla. Vivamus et posuere sem, eget hendrerit felis. Aenean vulputate, augue vel mollis feugiat, justo ipsum mollis dolor, eu mollis elit neque ut ipsum. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Fusce bibendum sem quam, vulputate laoreet mi dapibus imperdiet. Sed a purus non nibh pretium aliquet. Integer eget luctus augue, vitae tincidunt magna. Ut eros enim, egestas eu nulla et, lobortis egestas arcu. Cras id ipsum ac justo lacinia rutrum. Vivamus lectus leo, ultricies sed justo at, pellentesque feugiat magna. Ut sollicitudin neque elit, vel ornare mauris commodo id. + +Duis dapibus orci et sapien finibus finibus. Mauris eleifend, lacus at vestibulum maximus, quam ligula pharetra erat, sit amet dapibus neque elit vitae neque. In bibendum sollicitudin erat, eget ultricies tortor malesuada at. Sed sit amet orci turpis. Donec feugiat ligula nibh, molestie tincidunt lectus elementum id. Donec volutpat maximus nibh, in vulputate felis posuere eu. Cras tincidunt ullamcorper lacus. Phasellus porta lorem auctor, congue magna a, commodo elit. + +Etiam auctor mi quis elit sodales, eu pulvinar arcu condimentum. Aenean imperdiet risus et dapibus tincidunt. Nullam tincidunt dictum dui, sed commodo urna rutrum id. Ut mollis libero vel elit laoreet bibendum. Quisque arcu arcu, tincidunt at ultricies id, vulputate nec metus. In tristique posuere quam sit amet volutpat. Vivamus scelerisque et nunc at dapibus. Fusce finibus libero ut ligula pretium rhoncus. Mauris non elit in arcu finibus imperdiet. Pellentesque nec massa odio. Proin rutrum mauris non sagittis efficitur. Aliquam auctor quam at dignissim faucibus. Ut eget ligula in magna posuere ultricies vitae sit amet turpis. Duis maximus odio nulla. Donec gravida sem tristique tempus scelerisque. + +Interdum et malesuada fames ac ante ipsum primis in faucibus. Fusce pharetra magna vulputate aliquet tempus. Duis id hendrerit arcu. Quisque ut ex elit. Integer velit orci, venenatis ut sapien ac, placerat porttitor dui. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc hendrerit cursus diam, hendrerit finibus ipsum scelerisque ut. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. + +Nulla non euismod neque. Phasellus vel sapien eu metus pulvinar rhoncus. Suspendisse eu mollis tellus, quis vestibulum tortor. Maecenas interdum dolor sed nulla fermentum maximus. Donec imperdiet ullamcorper condimentum. Nam quis nibh ante. Praesent quis tellus ut tortor pulvinar blandit sit amet ut sapien. Vestibulum est orci, pellentesque vitae tristique sit amet, tristique non felis. + +Vivamus sodales pellentesque varius. Sed vel tempus ligula. Nulla tristique nisl vel dui facilisis, ac sodales augue hendrerit. Proin augue nisi, vestibulum quis augue nec, sagittis tincidunt velit. Vestibulum euismod, nulla nec sodales faucibus, urna sapien vulputate magna, id varius metus sapien ut neque. Duis in mollis urna, in scelerisque enim. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc condimentum dictum turpis, et egestas neque dapibus eget. Quisque fringilla, dui eu venenatis eleifend, erat nibh lacinia urna, at lacinia lacus sapien eu dui. Duis eu erat ut mi lacinia convallis a sed ex. + +Fusce elit metus, tincidunt nec eleifend a, hendrerit nec ligula. Duis placerat finibus sollicitudin. In euismod porta tellus, in luctus justo bibendum bibendum. Maecenas at magna eleifend lectus tincidunt suscipit ut a ligula. Nulla tempor accumsan felis, fermentum dapibus est eleifend vitae. Mauris urna sem, fringilla at ultricies non, ultrices in arcu. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam vehicula nunc at laoreet imperdiet. Nunc tristique ut risus id aliquet. Integer eleifend massa orci. + +Vestibulum sed ante sollicitudin nisi fringilla bibendum nec vel quam. Sed pretium augue eu ligula congue pulvinar. Donec vitae magna tincidunt, pharetra lacus id, convallis nulla. Cras viverra nisl nisl, varius convallis leo vulputate nec. Morbi at consequat dui, sed aliquet metus. Sed suscipit fermentum mollis. Maecenas nec mi sodales, tincidunt purus in, tristique mauris. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec interdum mi in velit efficitur, quis ultrices ex imperdiet. Sed vestibulum, magna ut tristique pretium, mi ipsum placerat tellus, non tempor enim augue et ex. Pellentesque eget felis quis ante sodales viverra ac sed lacus. Donec suscipit tempus massa, eget laoreet massa molestie at. + +Aenean fringilla dui non aliquet consectetur. Fusce cursus quam nec orci hendrerit faucibus. Donec consequat suscipit enim, non volutpat lectus auctor interdum. Proin lorem purus, maximus vel orci vitae, suscipit egestas turpis. Donec risus urna, congue a sem eu, aliquet placerat odio. Morbi gravida tristique turpis, quis efficitur enim. Nunc interdum gravida ipsum vel facilisis. Nunc congue finibus sollicitudin. Quisque euismod aliquet lectus et tincidunt. Curabitur ultrices sem ut mi fringilla fermentum. Morbi pretium, nisi sit amet dapibus congue, dolor enim consectetur risus, a interdum ligula odio sed odio. Quisque facilisis, mi at suscipit gravida, nunc sapien cursus justo, ut luctus odio nulla quis leo. Integer condimentum lobortis mauris, non egestas tellus lobortis sit amet. + +In sollicitudin velit ac ante vehicula, vitae varius tortor mollis. In hac habitasse platea dictumst. Quisque et orci lorem. Integer malesuada fringilla luctus. Pellentesque malesuada, mi non lobortis porttitor, ante ligula vulputate ante, nec dictum risus eros sit amet sapien. Nulla aliquam lorem libero, ac varius nulla tristique eget. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut pellentesque mauris orci, vel consequat mi varius a. Ut sit amet elit vulputate, lacinia metus non, fermentum nisl. Pellentesque eu nisi sed quam egestas blandit. Duis sit amet lobortis dolor. Donec consectetur sem interdum, tristique elit sit amet, sodales lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Fusce id aliquam augue. Sed pretium congue risus vitae lacinia. Vestibulum non vulputate risus, ut malesuada justo. + +Sed odio elit, consectetur ac mauris quis, consequat commodo libero. Fusce sodales velit vulputate pulvinar fermentum. Donec iaculis nec nisl eget faucibus. Mauris at dictum velit. Donec fermentum lectus eu viverra volutpat. Aliquam consequat facilisis lorem, cursus consequat dui bibendum ullamcorper. Pellentesque nulla magna, imperdiet at magna et, cursus egestas enim. Nullam semper molestie lectus sit amet semper. Duis eget tincidunt est. Integer id neque risus. Integer ultricies hendrerit vestibulum. Donec blandit blandit sagittis. Nunc consectetur vitae nisi consectetur volutpat. + +Nulla id lorem fermentum, efficitur magna a, hendrerit dui. Vivamus sagittis orci gravida, bibendum quam eget, molestie est. Phasellus nec enim tincidunt, volutpat sapien non, laoreet diam. Nulla posuere enim nec porttitor lobortis. Donec auctor odio ut orci eleifend, ut eleifend purus convallis. Interdum et malesuada fames ac ante ipsum primis in faucibus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut hendrerit, purus eget viverra tincidunt, sem magna imperdiet libero, et aliquam turpis neque vitae elit. Maecenas semper varius iaculis. Cras non lorem quis quam bibendum eleifend in et libero. Curabitur at purus mauris. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus porta diam sed elit eleifend gravida. + +Nulla facilisi. Ut ultricies diam vel diam consectetur, vel porta augue molestie. Fusce interdum sapien et metus facilisis pellentesque. Nulla convallis sem at nunc vehicula facilisis. Nam ac rutrum purus. Nunc bibendum, dolor sit amet tempus ullamcorper, lorem leo tempor sem, id fringilla nunc augue scelerisque augue. Nullam sit amet rutrum nisl. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Donec sed mauris gravida eros vehicula sagittis at eget orci. Cras elementum, eros at accumsan bibendum, libero neque blandit purus, vitae vestibulum libero massa ac nibh. Integer at placerat nulla. Mauris eu eleifend orci. Aliquam consequat ligula vitae erat porta lobortis. Duis fermentum elit ac aliquet ornare. + +Mauris eget cursus tellus, eget sodales purus. Aliquam malesuada, augue id vulputate finibus, nisi ex bibendum nisl, sit amet laoreet quam urna a dolor. Nullam ultricies, sapien eu laoreet consequat, erat eros dignissim diam, ultrices sodales lectus mauris et leo. Morbi lacinia eu ante at tempus. Sed iaculis finibus magna malesuada efficitur. Donec faucibus erat sit amet elementum feugiat. Praesent a placerat nisi. Etiam lacinia gravida diam, et sollicitudin sapien tincidunt ut. + +Maecenas felis quam, tincidunt vitae venenatis scelerisque, viverra vitae odio. Phasellus enim neque, ultricies suscipit malesuada sit amet, vehicula sit amet purus. Nulla placerat sit amet dui vel tincidunt. Nam quis neque vel magna commodo egestas. Vestibulum sagittis rutrum lorem ut congue. Maecenas vel ultrices tellus. Donec efficitur, urna ac consequat iaculis, lorem felis pharetra eros, eget faucibus orci lectus sit amet arcu. + +Ut a tempus nisi. Nulla facilisi. Praesent vulputate maximus mi et dapibus. Sed sit amet libero ac augue hendrerit efficitur in a sapien. Mauris placerat velit sit amet tellus sollicitudin faucibus. Donec egestas a magna ac suscipit. Duis enim sapien, mollis sed egestas et, vestibulum vel leo. + +Proin quis dapibus dui. Donec eu tincidunt nunc. Vivamus eget purus consectetur, maximus ante vitae, tincidunt elit. Aenean mattis dolor a gravida aliquam. Praesent quis tellus id sem maximus vulputate nec sed nulla. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur metus nulla, volutpat volutpat est eu, hendrerit congue erat. Aliquam sollicitudin augue ante. Sed sollicitudin, magna eu consequat elementum, mi augue ullamcorper felis, molestie imperdiet erat metus iaculis est. Proin ac tortor nisi. Pellentesque quis nisi risus. Integer enim sapien, tincidunt quis tortor id, accumsan venenatis mi. Nulla facilisi. + +Cras pretium sit amet quam congue maximus. Morbi lacus libero, imperdiet commodo massa sed, scelerisque placerat libero. Cras nisl nisi, consectetur sed bibendum eu, venenatis at enim. Proin sodales justo at quam aliquam, a consectetur mi ornare. Donec porta ac est sit amet efficitur. Suspendisse vestibulum tortor id neque imperdiet, id lacinia risus vehicula. Phasellus ac eleifend purus. Mauris vel gravida ante. Aliquam vitae lobortis risus. Sed vehicula consectetur tincidunt. Nam et justo vitae purus molestie consequat. Pellentesque ipsum ex, convallis quis blandit non, gravida et urna. Donec diam ligula amet. diff --git a/pkg/sentry/fs/ext/assets/file.txt b/pkg/sentry/fs/ext/assets/file.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/pkg/sentry/fs/ext/assets/file.txt @@ -0,0 +1 @@ +Hello World! diff --git a/pkg/sentry/fs/ext/assets/symlink.txt b/pkg/sentry/fs/ext/assets/symlink.txt new file mode 120000 index 000000000..4c330738c --- /dev/null +++ b/pkg/sentry/fs/ext/assets/symlink.txt @@ -0,0 +1 @@ +file.txt \ No newline at end of file diff --git a/pkg/sentry/fs/ext/assets/tiny.ext4 b/pkg/sentry/fs/ext/assets/tiny.ext4 new file mode 100644 index 000000000..a6859736d Binary files /dev/null and b/pkg/sentry/fs/ext/assets/tiny.ext4 differ -- cgit v1.2.3 From 69e0affaecda24b4d193e4592202b55b53afecc3 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 12 Jul 2019 09:09:46 -0700 Subject: Don't emit an event for extended attribute syscalls. These are filesystem-specific, and filesystems are allowed to return ENOTSUP if they are not supported. PiperOrigin-RevId: 257813477 --- pkg/sentry/syscalls/linux/linux64.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 680c0c64c..51db2d8f7 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -234,9 +234,9 @@ var AMD64 = &kernel.SyscallTable{ 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) - 188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), 193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), -- cgit v1.2.3