summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/BUILD4
-rw-r--r--pkg/sentry/kernel/abstract_socket_namespace.go8
-rw-r--r--pkg/sentry/kernel/auth/BUILD1
-rw-r--r--pkg/sentry/kernel/auth/credentials.go14
-rw-r--r--pkg/sentry/kernel/auth/id_map.go46
-rw-r--r--pkg/sentry/kernel/auth/user_namespace.go8
-rw-r--r--pkg/sentry/kernel/fasync/BUILD2
-rw-r--r--pkg/sentry/kernel/fasync/fasync.go4
-rw-r--r--pkg/sentry/kernel/fd_table.go2
-rw-r--r--pkg/sentry/kernel/fd_table_unsafe.go2
-rw-r--r--pkg/sentry/kernel/futex/BUILD3
-rw-r--r--pkg/sentry/kernel/futex/futex.go54
-rw-r--r--pkg/sentry/kernel/futex/futex_test.go4
-rw-r--r--pkg/sentry/kernel/ipc/BUILD20
-rw-r--r--pkg/sentry/kernel/ipc/object.go115
-rw-r--r--pkg/sentry/kernel/ipc/registry.go196
-rw-r--r--pkg/sentry/kernel/ipc_namespace.go8
-rw-r--r--pkg/sentry/kernel/kcov.go20
-rw-r--r--pkg/sentry/kernel/kernel.go6
-rw-r--r--pkg/sentry/kernel/kernel_opts.go3
-rw-r--r--pkg/sentry/kernel/msgqueue/BUILD36
-rw-r--r--pkg/sentry/kernel/msgqueue/msgqueue.go220
-rw-r--r--pkg/sentry/kernel/pipe/BUILD1
-rw-r--r--pkg/sentry/kernel/pipe/node.go7
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go7
-rw-r--r--pkg/sentry/kernel/pipe/pipe_unsafe.go2
-rw-r--r--pkg/sentry/kernel/pipe/pipe_util.go4
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go11
-rw-r--r--pkg/sentry/kernel/posixtimer.go18
-rw-r--r--pkg/sentry/kernel/ptrace.go42
-rw-r--r--pkg/sentry/kernel/ptrace_amd64.go1
-rw-r--r--pkg/sentry/kernel/ptrace_arm64.go1
-rw-r--r--pkg/sentry/kernel/rseq.go30
-rw-r--r--pkg/sentry/kernel/semaphore/BUILD14
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore.go299
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore_test.go20
-rw-r--r--pkg/sentry/kernel/sessions.go25
-rw-r--r--pkg/sentry/kernel/shm/BUILD2
-rw-r--r--pkg/sentry/kernel/shm/shm.go276
-rw-r--r--pkg/sentry/kernel/signalfd/BUILD1
-rw-r--r--pkg/sentry/kernel/signalfd/signalfd.go3
-rw-r--r--pkg/sentry/kernel/task.go6
-rw-r--r--pkg/sentry/kernel/task_acct.go6
-rw-r--r--pkg/sentry/kernel/task_block.go2
-rw-r--r--pkg/sentry/kernel/task_cgroup.go6
-rw-r--r--pkg/sentry/kernel/task_clone.go265
-rw-r--r--pkg/sentry/kernel/task_exit.go109
-rw-r--r--pkg/sentry/kernel/task_identity.go46
-rw-r--r--pkg/sentry/kernel/task_log.go2
-rw-r--r--pkg/sentry/kernel/task_run.go2
-rw-r--r--pkg/sentry/kernel/task_sched.go4
-rw-r--r--pkg/sentry/kernel/task_signals.go36
-rw-r--r--pkg/sentry/kernel/task_start.go3
-rw-r--r--pkg/sentry/kernel/task_syscall.go9
-rw-r--r--pkg/sentry/kernel/task_usermem.go15
-rw-r--r--pkg/sentry/kernel/thread_group.go22
-rw-r--r--pkg/sentry/kernel/time/BUILD2
-rw-r--r--pkg/sentry/kernel/time/time.go6
-rw-r--r--pkg/sentry/kernel/timekeeper_test.go4
59 files changed, 1230 insertions, 855 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 10563e3d7..e4e0dc04f 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -227,6 +227,7 @@ go_library(
"//pkg/context",
"//pkg/coverage",
"//pkg/cpuid",
+ "//pkg/errors",
"//pkg/errors/linuxerr",
"//pkg/eventchannel",
"//pkg/fspath",
@@ -256,6 +257,7 @@ go_library(
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/epoll",
"//pkg/sentry/kernel/futex",
+ "//pkg/sentry/kernel/msgqueue",
"//pkg/sentry/kernel/sched",
"//pkg/sentry/kernel/semaphore",
"//pkg/sentry/kernel/shm",
@@ -301,6 +303,7 @@ go_test(
deps = [
"//pkg/abi",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/hostarch",
"//pkg/sentry/arch",
"//pkg/sentry/contexttest",
@@ -312,6 +315,5 @@ go_test(
"//pkg/sentry/time",
"//pkg/sentry/usage",
"//pkg/sync",
- "//pkg/syserror",
],
)
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index d100e58d7..5d86a04f3 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -27,7 +27,7 @@ import (
// +stateify savable
type abstractEndpoint struct {
ep transport.BoundEndpoint
- socket refsvfs2.RefCounter
+ socket refsvfs2.TryRefCounter
name string
ns *AbstractSocketNamespace
}
@@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
// its backing socket.
type boundEndpoint struct {
transport.BoundEndpoint
- socket refsvfs2.RefCounter
+ socket refsvfs2.TryRefCounter
}
// Release implements transport.BoundEndpoint.Release.
@@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
//
// When the last reference managed by socket is dropped, ep may be removed from the
// namespace.
-func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.TryRefCounter) error {
a.mu.Lock()
defer a.mu.Unlock()
@@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran
// Remove removes the specified socket at name from the abstract socket
// namespace, if it has not yet been replaced.
-func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) {
+func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.TryRefCounter) {
a.mu.Lock()
defer a.mu.Unlock()
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 12180351d..7a1a36454 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -63,6 +63,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/bits",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/log",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
index 3325fedcb..fc245c54b 100644
--- a/pkg/sentry/kernel/auth/credentials.go
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -16,7 +16,7 @@ package auth
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
)
// Credentials contains information required to authorize privileged operations
@@ -203,7 +203,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) {
// uid must be mapped.
kuid := c.UserNamespace.MapToKUID(uid)
if !kuid.Ok() {
- return NoID, syserror.EINVAL
+ return NoID, linuxerr.EINVAL
}
// If c has CAP_SETUID, then it can use any UID in its user namespace.
if c.HasCapability(linux.CAP_SETUID) {
@@ -214,7 +214,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) {
if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID {
return kuid, nil
}
- return NoID, syserror.EPERM
+ return NoID, linuxerr.EPERM
}
// UseGID checks that c can use gid in its user namespace, then translates it
@@ -222,7 +222,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) {
func (c *Credentials) UseGID(gid GID) (KGID, error) {
kgid := c.UserNamespace.MapToKGID(gid)
if !kgid.Ok() {
- return NoID, syserror.EINVAL
+ return NoID, linuxerr.EINVAL
}
if c.HasCapability(linux.CAP_SETGID) {
return kgid, nil
@@ -230,7 +230,7 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID {
return kgid, nil
}
- return NoID, syserror.EPERM
+ return NoID, linuxerr.EPERM
}
// SetUID translates the provided uid to the root user namespace and updates c's
@@ -239,7 +239,7 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
func (c *Credentials) SetUID(uid UID) error {
kuid := c.UserNamespace.MapToKUID(uid)
if !kuid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
c.RealKUID = kuid
c.EffectiveKUID = kuid
@@ -253,7 +253,7 @@ func (c *Credentials) SetUID(uid UID) error {
func (c *Credentials) SetGID(gid GID) error {
kgid := c.UserNamespace.MapToKGID(gid)
if !kgid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
c.RealKGID = kgid
c.EffectiveKGID = kgid
diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go
index 28cbe159d..f06a374a0 100644
--- a/pkg/sentry/kernel/auth/id_map.go
+++ b/pkg/sentry/kernel/auth/id_map.go
@@ -17,7 +17,7 @@ package auth
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
)
// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns.
@@ -106,11 +106,11 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er
// than once to a uid_map file in a user namespace fails with the error
// EPERM. Similar rules apply for gid_map files." - user_namespaces(7)
if !ns.uidMapFromParent.IsEmpty() {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// "At least one line must be written to the file."
if len(entries) == 0 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
// """
// In order for a process to write to the /proc/[pid]/uid_map
@@ -121,12 +121,12 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er
// in the user namespace of the process pid.
// """
if !c.HasCapabilityIn(linux.CAP_SETUID, ns) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// "2. The writing process must either be in the user namespace of the process
// pid or be in the parent user namespace of the process pid."
if c.UserNamespace != ns && c.UserNamespace != ns.parent {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// """
// 3. (see trySetUIDMap)
@@ -145,14 +145,14 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er
// parent user namespace to a user ID (group ID) in the user namespace.
// """
if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// """
// + The writing process must have the same effective user ID as the
// process that created the user namespace.
// """
if c.EffectiveKUID != ns.owner {
- return syserror.EPERM
+ return linuxerr.EPERM
}
}
// trySetUIDMap leaves data in maps if it fails.
@@ -170,11 +170,11 @@ func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error {
// checks for NoID.
lastID := e.FirstID + e.Length
if lastID <= e.FirstID {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
lastParentID := e.FirstParentID + e.Length
if lastParentID <= e.FirstParentID {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
// "3. The mapped user IDs (group IDs) must in turn have a mapping in
// the parent user namespace."
@@ -182,14 +182,14 @@ func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error {
// mappings when it's created, so SetUIDMap would have returned EPERM
// without reaching this point if ns is root.
if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// If either of these Adds fail, we have an overlapping range.
if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
}
return nil
@@ -202,24 +202,24 @@ func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) er
ns.mu.Lock()
defer ns.mu.Unlock()
if !ns.gidMapFromParent.IsEmpty() {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if len(entries) == 0 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if !c.HasCapabilityIn(linux.CAP_SETGID, ns) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if c.UserNamespace != ns && c.UserNamespace != ns.parent {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) {
if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// It's correct for this to still be UID.
if c.EffectiveKUID != ns.owner {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// "In the case of gid_map, use of the setgroups(2) system call must
// first be denied by writing "deny" to the /proc/[pid]/setgroups file
@@ -239,20 +239,20 @@ func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error {
for _, e := range entries {
lastID := e.FirstID + e.Length
if lastID <= e.FirstID {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
lastParentID := e.FirstParentID + e.Length
if lastParentID <= e.FirstParentID {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
}
return nil
diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go
index 9dd52c860..40a406f9d 100644
--- a/pkg/sentry/kernel/auth/user_namespace.go
+++ b/pkg/sentry/kernel/auth/user_namespace.go
@@ -17,8 +17,8 @@ package auth
import (
"math"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// A UserNamespace represents a user namespace. See user_namespaces(7) for
@@ -105,7 +105,7 @@ func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) {
if c.UserNamespace.depth() >= maxUserNamespaceDepth {
// "... Calls to unshare(2) or clone(2) that would cause this limit to
// be exceeded fail with the error EUSERS." - user_namespaces(7)
- return nil, syserror.EUSERS
+ return nil, linuxerr.EUSERS
}
// "EPERM: CLONE_NEWUSER was specified in flags, but either the effective
// user ID or the effective group ID of the caller does not have a mapping
@@ -114,10 +114,10 @@ func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) {
// process are mapped to user IDs and group IDs in the user namespace of
// the calling process at the time of the call." - unshare(2)
if !c.EffectiveKUID.In(c.UserNamespace).Ok() {
- return nil, syserror.EPERM
+ return nil, linuxerr.EPERM
}
if !c.EffectiveKGID.In(c.UserNamespace).Ok() {
- return nil, syserror.EPERM
+ return nil, linuxerr.EPERM
}
return &UserNamespace{
parent: c.UserNamespace,
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 6224a0cbd..6b2dd09da 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -8,12 +8,12 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/errors/linuxerr",
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/waiter",
],
)
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index 5d584dc45..473987a79 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -17,12 +17,12 @@ package fasync
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -248,7 +248,7 @@ func (a *FileAsync) Signal() linux.Signal {
// to send SIGIO.
func (a *FileAsync) SetSignal(signal linux.Signal) error {
if signal != 0 && !signal.IsValid() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
a.mu.Lock()
defer a.mu.Unlock()
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 9f7702fcc..eff556a0c 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -108,7 +108,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
ctx := context.Background()
f.initNoLeakCheck() // Initialize table.
- f.fdBitmap = bitmap.BitmapWithSize(uint32(math.MaxUint16))
+ f.fdBitmap = bitmap.New(uint32(math.MaxUint16))
for fd, d := range m {
if fd < 0 {
panic(fmt.Sprintf("FD is not supposed to be negative. FD: %d", fd))
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index c4cac6b99..2b3e6ef71 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -46,7 +46,7 @@ func (f *FDTable) initNoLeakCheck() {
func (f *FDTable) init() {
f.initNoLeakCheck()
f.InitRefs()
- f.fdBitmap = bitmap.BitmapWithSize(uint32(math.MaxUint16))
+ f.fdBitmap = bitmap.New(uint32(math.MaxUint16))
}
// get gets a file entry.
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 6c31e082c..cfdea5cf7 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -37,6 +37,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/hostarch",
"//pkg/log",
"//pkg/sentry/memmap",
@@ -53,8 +54,8 @@ go_test(
library = ":futex",
deps = [
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/hostarch",
"//pkg/sync",
- "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 0427cf3f4..f5c364c96 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -20,6 +20,7 @@ package futex
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
@@ -122,7 +123,7 @@ func check(t Target, addr hostarch.Addr, val uint32) error {
return err
}
if cur != val {
- return syserror.EAGAIN
+ return linuxerr.EAGAIN
}
return nil
}
@@ -332,7 +333,7 @@ func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) {
// Ensure the address is aligned.
// It must be a DWORD boundary.
if addr&0x3 != 0 {
- return Key{}, syserror.EINVAL
+ return Key{}, linuxerr.EINVAL
}
if private {
return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
@@ -397,8 +398,8 @@ func (m *Manager) Fork() *Manager {
}
// lockBucket returns a locked bucket for the given key.
-func (m *Manager) lockBucket(k *Key) *bucket {
- var b *bucket
+// +checklocksacquire:b.mu
+func (m *Manager) lockBucket(k *Key) (b *bucket) {
if k.Kind == KindSharedMappable {
b = m.sharedBucket
} else {
@@ -409,7 +410,9 @@ func (m *Manager) lockBucket(k *Key) *bucket {
}
// lockBuckets returns locked buckets for the given keys.
-func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
+// +checklocksacquire:b1.mu
+// +checklocksacquire:b2.mu
+func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) {
// Buckets must be consistently ordered to avoid circular lock
// dependencies. We order buckets in m.privateBuckets by index (lowest
// index first), and all buckets in m.privateBuckets precede
@@ -419,8 +422,8 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
i1 := bucketIndexForAddr(k1.addr())
i2 := bucketIndexForAddr(k2.addr())
- b1 := &m.privateBuckets[i1]
- b2 := &m.privateBuckets[i2]
+ b1 = &m.privateBuckets[i1]
+ b2 = &m.privateBuckets[i2]
switch {
case i1 < i2:
b1.mu.Lock()
@@ -431,19 +434,30 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
default:
b1.mu.Lock()
}
- return b1, b2
+ return b1, b2 // +checklocksforce
}
// At least one of b1 or b2 should be m.sharedBucket.
- b1 := m.sharedBucket
- b2 := m.sharedBucket
+ b1 = m.sharedBucket
+ b2 = m.sharedBucket
if k1.Kind != KindSharedMappable {
b1 = m.lockBucket(k1)
} else if k2.Kind != KindSharedMappable {
b2 = m.lockBucket(k2)
}
m.sharedBucket.mu.Lock()
- return b1, b2
+ return b1, b2 // +checklocksforce
+}
+
+// unlockBuckets unlocks two buckets.
+// +checklocksrelease:b1.mu
+// +checklocksrelease:b2.mu
+func (m *Manager) unlockBuckets(b1, b2 *bucket) {
+ b1.mu.Unlock()
+ if b1 != b2 {
+ b2.mu.Unlock()
+ }
+ return // +checklocksforce
}
// Wake wakes up to n waiters matching the bitmask on the given addr.
@@ -476,10 +490,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, c
defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
- defer b1.mu.Unlock()
- if b2 != b1 {
- defer b2.mu.Unlock()
- }
+ defer m.unlockBuckets(b1, b2)
if checkval {
if err := check(t, addr, val); err != nil {
@@ -526,10 +537,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwa
defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
- defer b1.mu.Unlock()
- if b2 != b1 {
- defer b2.mu.Unlock()
- }
+ defer m.unlockBuckets(b1, b2)
done := 0
cond, err := atomicOp(t, addr2, op)
@@ -670,7 +678,7 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint
return false, err
}
if (cur & linux.FUTEX_TID_MASK) == tid {
- return false, syserror.EDEADLK
+ return false, linuxerr.EDEADLK
}
if (cur & linux.FUTEX_TID_MASK) == 0 {
@@ -745,7 +753,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu
}
if (cur & linux.FUTEX_TID_MASK) != tid {
- return syserror.EPERM
+ return linuxerr.EPERM
}
var next *Waiter // Who's the next owner?
@@ -773,7 +781,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu
if prev != cur {
// Let user mode handle CAS races. This is different than lock, which
// retries when CAS fails.
- return syserror.EAGAIN
+ return linuxerr.EAGAIN
}
return nil
}
@@ -790,7 +798,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu
return err
}
if prev != cur {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
b.wakeWaiterLocked(next)
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index deba44e5c..04c136f87 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -21,8 +21,8 @@ import (
"testing"
"unsafe"
- "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -488,7 +488,7 @@ func (t *testMutex) Lock() {
// Wait for it to be "not locked".
w := NewWaiter()
err := t.m.WaitPrepare(w, t.d, t.a, true, testMutexLocked, ^uint32(0))
- if err == unix.EAGAIN {
+ if linuxerr.Equals(linuxerr.EAGAIN, err) {
continue
}
if err != nil {
diff --git a/pkg/sentry/kernel/ipc/BUILD b/pkg/sentry/kernel/ipc/BUILD
new file mode 100644
index 000000000..e42a94e15
--- /dev/null
+++ b/pkg/sentry/kernel/ipc/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "ipc",
+ srcs = [
+ "object.go",
+ "registry.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/errors/linuxerr",
+ "//pkg/log",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/kernel/auth",
+ ],
+)
diff --git a/pkg/sentry/kernel/ipc/object.go b/pkg/sentry/kernel/ipc/object.go
new file mode 100644
index 000000000..387b35e7e
--- /dev/null
+++ b/pkg/sentry/kernel/ipc/object.go
@@ -0,0 +1,115 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ipc defines functionality and utilities common to sysvipc mechanisms.
+//
+// Lock ordering: [shm/semaphore/msgqueue].Registry.mu -> Mechanism
+package ipc
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// Key is a user-provided identifier for IPC objects.
+type Key int32
+
+// ID is a kernel identifier for IPC objects.
+type ID int32
+
+// Object represents an abstract IPC object with fields common to all IPC
+// mechanisms.
+//
+// +stateify savable
+type Object struct {
+ // User namespace which owns the IPC namespace which owns the IPC object.
+ // Immutable.
+ UserNS *auth.UserNamespace
+
+ // ID is a kernel identifier for the IPC object. Immutable.
+ ID ID
+
+ // Key is a user-provided identifier for the IPC object. Immutable.
+ Key Key
+
+ // Creator is the user who created the IPC object. Immutable.
+ Creator fs.FileOwner
+
+ // Owner is the current owner of the IPC object.
+ Owner fs.FileOwner
+
+ // Perms is the access permissions the IPC object.
+ Perms fs.FilePermissions
+}
+
+// Mechanism represents a SysV mechanism that holds an IPC object. It can also
+// be looked at as a container for an ipc.Object, which is by definition a fully
+// functional SysV object.
+type Mechanism interface {
+ // Lock behaves the same as Mutex.Lock on the mechanism.
+ Lock()
+
+ // Unlock behaves the same as Mutex.Unlock on the mechanism.
+ Unlock()
+
+ // Object returns a pointer to the mechanism's ipc.Object. Mechanism.Lock,
+ // and Mechanism.Unlock should be used when the object is used.
+ Object() *Object
+
+ // Destroy destroys the mechanism.
+ Destroy()
+}
+
+// NewObject returns a new, initialized ipc.Object. The newly returned object
+// doesn't have a valid ID. When the object is registered, the registry assigns
+// it a new unique ID.
+func NewObject(un *auth.UserNamespace, key Key, creator, owner fs.FileOwner, perms fs.FilePermissions) *Object {
+ return &Object{
+ UserNS: un,
+ Key: key,
+ Creator: creator,
+ Owner: owner,
+ Perms: perms,
+ }
+}
+
+// CheckOwnership verifies whether an IPC object may be accessed using creds as
+// an owner. See ipc/util.c:ipcctl_obtain_check() in Linux.
+func (o *Object) CheckOwnership(creds *auth.Credentials) bool {
+ if o.Owner.UID == creds.EffectiveKUID || o.Creator.UID == creds.EffectiveKUID {
+ return true
+ }
+
+ // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux
+ // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented
+ // for use to "override IPC ownership checks".
+ return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, o.UserNS)
+}
+
+// CheckPermissions verifies whether an IPC object is accessible using creds for
+// access described by req. See ipc/util.c:ipcperms() in Linux.
+func (o *Object) CheckPermissions(creds *auth.Credentials, req fs.PermMask) bool {
+ p := o.Perms.Other
+ if o.Owner.UID == creds.EffectiveKUID {
+ p = o.Perms.User
+ } else if creds.InGroup(o.Owner.GID) {
+ p = o.Perms.Group
+ }
+
+ if p.SupersetOf(req) {
+ return true
+ }
+ return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, o.UserNS)
+}
diff --git a/pkg/sentry/kernel/ipc/registry.go b/pkg/sentry/kernel/ipc/registry.go
new file mode 100644
index 000000000..91de19070
--- /dev/null
+++ b/pkg/sentry/kernel/ipc/registry.go
@@ -0,0 +1,196 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipc
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// Registry is similar to Object, but for registries. It represent an abstract
+// SysV IPC registry with fields common to all SysV registries. Registry is not
+// thread-safe, and should be protected using a mutex.
+//
+// +stateify savable
+type Registry struct {
+ // UserNS owning the IPC namespace this registry belongs to. Immutable.
+ UserNS *auth.UserNamespace
+
+ // objects is a map of IDs to IPC mechanisms.
+ objects map[ID]Mechanism
+
+ // KeysToIDs maps a lookup key to an ID.
+ keysToIDs map[Key]ID
+
+ // lastIDUsed is used to find the next available ID for object creation.
+ lastIDUsed ID
+}
+
+// NewRegistry return a new, initialized ipc.Registry.
+func NewRegistry(userNS *auth.UserNamespace) *Registry {
+ return &Registry{
+ UserNS: userNS,
+ objects: make(map[ID]Mechanism),
+ keysToIDs: make(map[Key]ID),
+ }
+}
+
+// Find uses key to search for and return a SysV mechanism. Find returns an
+// error if an object is found by shouldn't be, or if the user doesn't have
+// permission to use the object. If no object is found, Find checks create
+// flag, and returns an error only if it's false.
+func (r *Registry) Find(ctx context.Context, key Key, mode linux.FileMode, create, exclusive bool) (Mechanism, error) {
+ if id, ok := r.keysToIDs[key]; ok {
+ mech := r.objects[id]
+ mech.Lock()
+ defer mech.Unlock()
+
+ obj := mech.Object()
+ creds := auth.CredentialsFromContext(ctx)
+ if !obj.CheckPermissions(creds, fs.PermsFromMode(mode)) {
+ // The [calling process / user] does not have permission to access
+ // the set, and does not have the CAP_IPC_OWNER capability in the
+ // user namespace that governs its IPC namespace.
+ return nil, linuxerr.EACCES
+ }
+
+ if create && exclusive {
+ // IPC_CREAT and IPC_EXCL were specified, but an object already
+ // exists for key.
+ return nil, linuxerr.EEXIST
+ }
+ return mech, nil
+ }
+
+ if !create {
+ // No object exists for key and msgflg did not specify IPC_CREAT.
+ return nil, linuxerr.ENOENT
+ }
+
+ return nil, nil
+}
+
+// Register adds the given object into Registry.Objects, and assigns it a new
+// ID. It returns an error if all IDs are exhausted.
+func (r *Registry) Register(m Mechanism) error {
+ id, err := r.newID()
+ if err != nil {
+ return err
+ }
+
+ obj := m.Object()
+ obj.ID = id
+
+ r.objects[id] = m
+ r.keysToIDs[obj.Key] = id
+
+ return nil
+}
+
+// newID finds the first unused ID in the registry, and returns an error if
+// non is found.
+func (r *Registry) newID() (ID, error) {
+ // Find the next available ID.
+ for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
+ // Handle wrap around.
+ if id < 0 {
+ id = 0
+ continue
+ }
+ if r.objects[id] == nil {
+ r.lastIDUsed = id
+ return id, nil
+ }
+ }
+
+ log.Warningf("ids exhausted, they may be leaking")
+
+ // The man pages for shmget(2) mention that ENOSPC should be used if "All
+ // possible shared memory IDs have been taken (SHMMNI)". Other SysV
+ // mechanisms don't have a specific errno for running out of IDs, but they
+ // return ENOSPC if the max number of objects is exceeded, so we assume that
+ // it's the same case.
+ return 0, linuxerr.ENOSPC
+}
+
+// Remove removes the mechanism with the given id from the registry, and calls
+// mechanism.Destroy to perform mechanism-specific removal.
+func (r *Registry) Remove(id ID, creds *auth.Credentials) error {
+ mech := r.objects[id]
+ if mech == nil {
+ return linuxerr.EINVAL
+ }
+
+ mech.Lock()
+ defer mech.Unlock()
+
+ obj := mech.Object()
+
+ // The effective user ID of the calling process must match the creator or
+ // owner of the [mechanism], or the caller must be privileged.
+ if !obj.CheckOwnership(creds) {
+ return linuxerr.EPERM
+ }
+
+ delete(r.objects, obj.ID)
+ delete(r.keysToIDs, obj.Key)
+ mech.Destroy()
+
+ return nil
+}
+
+// ForAllObjects executes a given function for all given objects.
+func (r *Registry) ForAllObjects(f func(o Mechanism)) {
+ for _, o := range r.objects {
+ f(o)
+ }
+}
+
+// FindByID returns the mechanism with the given ID, nil if non exists.
+func (r *Registry) FindByID(id ID) Mechanism {
+ return r.objects[id]
+}
+
+// DissociateKey removes the association between a mechanism and its key
+// (deletes it from r.keysToIDs), preventing it from being discovered by any new
+// process, but not necessarily destroying it. If the given key doesn't exist,
+// nothing is changed.
+func (r *Registry) DissociateKey(key Key) {
+ delete(r.keysToIDs, key)
+}
+
+// DissociateID removes the association between a mechanism and its ID (deletes
+// it from r.objects). An ID can't be removed unless the associated key is
+// removed already, this is done to prevent the users from acquiring nil a
+// Mechanism.
+//
+// Precondition: must be preceded by a call to r.DissociateKey.
+func (r *Registry) DissociateID(id ID) {
+ delete(r.objects, id)
+}
+
+// ObjectCount returns the number of registered objects.
+func (r *Registry) ObjectCount() int {
+ return len(r.objects)
+}
+
+// LastIDUsed returns the last used ID.
+func (r *Registry) LastIDUsed() ID {
+ return r.lastIDUsed
+}
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 9545bb5ef..0b101b1bb 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -17,6 +17,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue"
"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
)
@@ -30,6 +31,7 @@ type IPCNamespace struct {
// User namespace which owns this IPC namespace. Immutable.
userNS *auth.UserNamespace
+ queues *msgqueue.Registry
semaphores *semaphore.Registry
shms *shm.Registry
}
@@ -38,6 +40,7 @@ type IPCNamespace struct {
func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
ns := &IPCNamespace{
userNS: userNS,
+ queues: msgqueue.NewRegistry(userNS),
semaphores: semaphore.NewRegistry(userNS),
shms: shm.NewRegistry(userNS),
}
@@ -45,6 +48,11 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
return ns
}
+// MsgqueueRegistry returns the message queue registry for this namespace.
+func (i *IPCNamespace) MsgqueueRegistry() *msgqueue.Registry {
+ return i.queues
+}
+
// SemaphoreRegistry returns the semaphore set registry for this namespace.
func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry {
return i.semaphores
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
index 4b943106b..e8a71bec1 100644
--- a/pkg/sentry/kernel/kcov.go
+++ b/pkg/sentry/kernel/kcov.go
@@ -22,13 +22,13 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/coverage"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/usage"
- "gvisor.dev/gvisor/pkg/syserror"
)
// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
@@ -125,19 +125,19 @@ func (kcov *Kcov) InitTrace(size uint64) error {
defer kcov.mu.Unlock()
if kcov.mode != linux.KCOV_MODE_DISABLED {
- return syserror.EBUSY
+ return linuxerr.EBUSY
}
// To simplify all the logic around mapping, we require that the length of the
// shared region is a multiple of the system page size.
if (8*size)&(hostarch.PageSize-1) != 0 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
// We need space for at least two uint64s to hold current position and a
// single PC.
if size < 2 || size > kcovAreaSizeMax {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
kcov.size = size
@@ -157,7 +157,7 @@ func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error {
// KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call.
if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
switch traceKind {
@@ -165,13 +165,13 @@ func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error {
kcov.mode = linux.KCOV_MODE_TRACE_PC
case linux.KCOV_TRACE_CMP:
// We do not support KCOV_MODE_TRACE_CMP.
- return syserror.ENOTSUP
+ return linuxerr.ENOTSUP
default:
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if kcov.owningTask != nil && kcov.owningTask != t {
- return syserror.EBUSY
+ return linuxerr.EBUSY
}
kcov.owningTask = t
@@ -195,7 +195,7 @@ func (kcov *Kcov) DisableTrace(ctx context.Context) error {
}
if t != kcov.owningTask {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
kcov.mode = linux.KCOV_MODE_INIT
kcov.owningTask = nil
@@ -237,7 +237,7 @@ func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) erro
defer kcov.mu.Unlock()
if kcov.mode != linux.KCOV_MODE_INIT {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if kcov.mappable == nil {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 352c36ba9..df5160b67 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -1299,11 +1299,11 @@ func (k *Kernel) WaitExited() {
}
// Kill requests that all tasks in k immediately exit as if group exiting with
-// status es. Kill does not wait for tasks to exit.
-func (k *Kernel) Kill(es ExitStatus) {
+// status ws. Kill does not wait for tasks to exit.
+func (k *Kernel) Kill(ws linux.WaitStatus) {
k.extMu.Lock()
defer k.extMu.Unlock()
- k.tasks.Kill(es)
+ k.tasks.Kill(ws)
}
// Pause requests that all tasks in k temporarily stop executing, and blocks
diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go
index 2e66ec587..5ffafb0d1 100644
--- a/pkg/sentry/kernel/kernel_opts.go
+++ b/pkg/sentry/kernel/kernel_opts.go
@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+//go:build go1.1
+// +build go1.1
+
package kernel
// SpecialOpts contains non-standard options for the kernel.
diff --git a/pkg/sentry/kernel/msgqueue/BUILD b/pkg/sentry/kernel/msgqueue/BUILD
new file mode 100644
index 000000000..5ec11e1f6
--- /dev/null
+++ b/pkg/sentry/kernel/msgqueue/BUILD
@@ -0,0 +1,36 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+ name = "message_list",
+ out = "message_list.go",
+ package = "msgqueue",
+ prefix = "msg",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Message",
+ "Linker": "*Message",
+ },
+)
+
+go_library(
+ name = "msgqueue",
+ srcs = [
+ "message_list.go",
+ "msgqueue.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/errors/linuxerr",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/ipc",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sync",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/kernel/msgqueue/msgqueue.go b/pkg/sentry/kernel/msgqueue/msgqueue.go
new file mode 100644
index 000000000..3ce926950
--- /dev/null
+++ b/pkg/sentry/kernel/msgqueue/msgqueue.go
@@ -0,0 +1,220 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package msgqueue implements System V message queues.
+package msgqueue
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+ // System-wide limit for maximum number of queues.
+ maxQueues = linux.MSGMNI
+
+ // Maximum size of a queue in bytes.
+ maxQueueBytes = linux.MSGMNB
+
+ // Maximum size of a message in bytes.
+ maxMessageBytes = linux.MSGMAX
+)
+
+// Registry contains a set of message queues that can be referenced using keys
+// or IDs.
+//
+// +stateify savable
+type Registry struct {
+ // mu protects all the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // reg defines basic fields and operations needed for all SysV registries.
+ reg *ipc.Registry
+}
+
+// NewRegistry returns a new Registry ready to be used.
+func NewRegistry(userNS *auth.UserNamespace) *Registry {
+ return &Registry{
+ reg: ipc.NewRegistry(userNS),
+ }
+}
+
+// Queue represents a SysV message queue, described by sysvipc(7).
+//
+// +stateify savable
+type Queue struct {
+ // registry is the registry owning this queue. Immutable.
+ registry *Registry
+
+ // mu protects all the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // dead is set to true when a queue is removed from the registry and should
+ // not be used. Operations on the queue should check dead, and return
+ // EIDRM if set to true.
+ dead bool
+
+ // obj defines basic fields that should be included in all SysV IPC objects.
+ obj *ipc.Object
+
+ // senders holds a queue of blocked message senders. Senders are notified
+ // when enough space is available in the queue to insert their message.
+ senders waiter.Queue
+
+ // receivers holds a queue of blocked receivers. Receivers are notified
+ // when a new message is inserted into the queue and can be received.
+ receivers waiter.Queue
+
+ // messages is a list of sent messages.
+ messages msgList
+
+ // sendTime is the last time a msgsnd was perfomed.
+ sendTime ktime.Time
+
+ // receiveTime is the last time a msgrcv was performed.
+ receiveTime ktime.Time
+
+ // changeTime is the last time the queue was modified using msgctl.
+ changeTime ktime.Time
+
+ // byteCount is the current number of message bytes in the queue.
+ byteCount uint64
+
+ // messageCount is the current number of messages in the queue.
+ messageCount uint64
+
+ // maxBytes is the maximum allowed number of bytes in the queue, and is also
+ // used as a limit for the number of total possible messages.
+ maxBytes uint64
+
+ // sendPID is the PID of the process that performed the last msgsnd.
+ sendPID int32
+
+ // receivePID is the PID of the process that performed the last msgrcv.
+ receivePID int32
+}
+
+// Message represents a message exchanged through a Queue via msgsnd(2) and
+// msgrcv(2).
+//
+// +stateify savable
+type Message struct {
+ msgEntry
+
+ // mType is an integer representing the type of the sent message.
+ mType int64
+
+ // mText is an untyped block of memory.
+ mText []byte
+
+ // mSize is the size of mText.
+ mSize uint64
+}
+
+// FindOrCreate creates a new message queue or returns an existing one. See
+// msgget(2).
+func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, mode linux.FileMode, private, create, exclusive bool) (*Queue, error) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if !private {
+ queue, err := r.reg.Find(ctx, key, mode, create, exclusive)
+ if err != nil {
+ return nil, err
+ }
+
+ if queue != nil {
+ return queue.(*Queue), nil
+ }
+ }
+
+ // Check system-wide limits.
+ if r.reg.ObjectCount() >= maxQueues {
+ return nil, linuxerr.ENOSPC
+ }
+
+ return r.newQueueLocked(ctx, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode))
+}
+
+// newQueueLocked creates a new queue using the given fields. An error is
+// returned if there're no more available identifiers.
+//
+// Precondition: r.mu must be held.
+func (r *Registry) newQueueLocked(ctx context.Context, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions) (*Queue, error) {
+ q := &Queue{
+ registry: r,
+ obj: ipc.NewObject(r.reg.UserNS, key, creator, creator, perms),
+ sendTime: ktime.ZeroTime,
+ receiveTime: ktime.ZeroTime,
+ changeTime: ktime.NowFromContext(ctx),
+ maxBytes: maxQueueBytes,
+ }
+
+ err := r.reg.Register(q)
+ if err != nil {
+ return nil, err
+ }
+ return q, nil
+}
+
+// Remove removes the queue with specified ID. All waiters (readers and
+// writers) and writers will be awakened and fail. Remove will return an error
+// if the ID is invalid, or the the user doesn't have privileges.
+func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ r.reg.Remove(id, creds)
+ return nil
+}
+
+// Lock implements ipc.Mechanism.Lock.
+func (q *Queue) Lock() {
+ q.mu.Lock()
+}
+
+// Unlock implements ipc.mechanism.Unlock.
+//
+// +checklocksignore
+func (q *Queue) Unlock() {
+ q.mu.Unlock()
+}
+
+// Object implements ipc.Mechanism.Object.
+func (q *Queue) Object() *ipc.Object {
+ return q.obj
+}
+
+// Destroy implements ipc.Mechanism.Destroy.
+func (q *Queue) Destroy() {
+ q.dead = true
+
+ // Notify waiters. Senders and receivers will try to run, and return an
+ // error (EIDRM). Waiters should remove themselves from the queue after
+ // waking up.
+ q.senders.Notify(waiter.EventOut)
+ q.receivers.Notify(waiter.EventIn)
+}
+
+// ID returns queue's ID.
+func (q *Queue) ID() ipc.ID {
+ return q.obj.ID
+}
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index af46b3e08..94ebac7c5 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -21,6 +21,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/amutex",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/hostarch",
"//pkg/marshal/primitive",
"//pkg/safemem",
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 6497dc4ba..08786d704 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -17,6 +17,7 @@ package pipe
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sync"
@@ -112,7 +113,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
// read side isn't open yet.
if flags.NonBlocking {
w.DecRef(ctx)
- return nil, syserror.ENXIO
+ return nil, linuxerr.ENXIO
}
if !waitFor(&i.mu, &i.rWakeup, ctx) {
@@ -130,10 +131,10 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
return rw, nil
default:
- return nil, syserror.EINVAL
+ return nil, linuxerr.EINVAL
}
}
func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error {
- return syserror.EPIPE
+ return linuxerr.EPIPE
}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 06769931a..85e3ce9f4 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -22,6 +22,7 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -428,18 +429,18 @@ func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) {
// SetFifoSize implements fs.FifoSizer.SetFifoSize.
func (p *Pipe) SetFifoSize(size int64) (int64, error) {
if size < 0 {
- return 0, syserror.EINVAL
+ return 0, linuxerr.EINVAL
}
if size < MinimumPipeSize {
size = MinimumPipeSize // Per spec.
}
if size > MaximumPipeSize {
- return 0, syserror.EPERM
+ return 0, linuxerr.EPERM
}
p.mu.Lock()
defer p.mu.Unlock()
if size < p.size {
- return 0, syserror.EBUSY
+ return 0, linuxerr.EBUSY
}
p.max = size
return size, nil
diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go
index dd60cba24..077c5d596 100644
--- a/pkg/sentry/kernel/pipe/pipe_unsafe.go
+++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go
@@ -23,6 +23,8 @@ import (
// concurrent calls cannot deadlock.
//
// Preconditions: x != y.
+// +checklocksacquire:x.mu
+// +checklocksacquire:y.mu
func lockTwoPipes(x, y *Pipe) {
// Lock the two pipes in order of increasing address.
if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 3fa5d1d2f..c883a9014 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -86,7 +87,7 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error)
if n > 0 {
p.Notify(waiter.ReadableEvents)
}
- if err == unix.EPIPE {
+ if linuxerr.Equals(linuxerr.EPIPE, err) {
// If we are returning EPIPE send SIGPIPE to the task.
if sendSig := linux.SignalNoInfoFuncFromContext(ctx); sendSig != nil {
sendSig(linux.SIGPIPE)
@@ -156,6 +157,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume
//
// mu must be held by the caller. waitFor returns with mu held, but it will
// drop mu before blocking for any reader/writers.
+// +checklocks:mu
func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool {
// Ideally this function would simply use a condition variable. However, the
// wait needs to be interruptible via 'sleeper', so we must sychronize via a
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 95b948edb..077d5fd7f 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -17,6 +17,7 @@ package pipe
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -79,7 +80,7 @@ func (vp *VFSPipe) ReaderWriterPair(ctx context.Context, mnt *vfs.Mount, vfsd *v
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
- return syserror.ESPIPE
+ return linuxerr.ESPIPE
}
// Open opens the pipe represented by vp.
@@ -90,7 +91,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
readable := vfs.MayReadFileWithOpenFlags(statusFlags)
writable := vfs.MayWriteFileWithOpenFlags(statusFlags)
if !readable && !writable {
- return nil, syserror.EINVAL
+ return nil, linuxerr.EINVAL
}
fd, err := vp.newFD(mnt, vfsd, statusFlags, locks)
@@ -131,7 +132,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
// side isn't open yet.
if statusFlags&linux.O_NONBLOCK != 0 {
fd.DecRef(ctx)
- return nil, syserror.ENXIO
+ return nil, linuxerr.ENXIO
}
// Wait for a reader to open the other end.
if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
@@ -224,7 +225,7 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
- return syserror.ESPIPE
+ return linuxerr.ESPIPE
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -415,7 +416,7 @@ func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
// Preconditions: count > 0.
func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
if dst.pipe == src.pipe {
- return 0, syserror.EINVAL
+ return 0, linuxerr.EINVAL
}
lockTwoPipes(dst.pipe, src.pipe)
diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go
index d801a3d83..319754a42 100644
--- a/pkg/sentry/kernel/posixtimer.go
+++ b/pkg/sentry/kernel/posixtimer.go
@@ -18,8 +18,8 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
- "gvisor.dev/gvisor/pkg/syserror"
)
// IntervalTimer represents a POSIX interval timer as described by
@@ -175,7 +175,7 @@ func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.
break
}
if t.tg.nextTimerID == end {
- return 0, syserror.EAGAIN
+ return 0, linuxerr.EAGAIN
}
}
@@ -214,16 +214,16 @@ func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.
target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)]
t.tg.pidns.owner.mu.RUnlock()
if !ok || target.tg != t.tg {
- return 0, syserror.EINVAL
+ return 0, linuxerr.EINVAL
}
it.target = target
default:
- return 0, syserror.EINVAL
+ return 0, linuxerr.EINVAL
}
if sigev.Notify != linux.SIGEV_NONE {
it.signo = linux.Signal(sigev.Signo)
if !it.signo.IsValid() {
- return 0, syserror.EINVAL
+ return 0, linuxerr.EINVAL
}
}
it.timer = ktime.NewTimer(c, it)
@@ -238,7 +238,7 @@ func (t *Task) IntervalTimerDelete(id linux.TimerID) error {
defer t.tg.timerMu.Unlock()
it := t.tg.timers[id]
if it == nil {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
delete(t.tg.timers, id)
it.DestroyTimer()
@@ -251,7 +251,7 @@ func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs
defer t.tg.timerMu.Unlock()
it := t.tg.timers[id]
if it == nil {
- return linux.Itimerspec{}, syserror.EINVAL
+ return linux.Itimerspec{}, linuxerr.EINVAL
}
newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock())
@@ -269,7 +269,7 @@ func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error)
defer t.tg.timerMu.Unlock()
it := t.tg.timers[id]
if it == nil {
- return linux.Itimerspec{}, syserror.EINVAL
+ return linux.Itimerspec{}, linuxerr.EINVAL
}
tm, s := it.timer.Get()
@@ -285,7 +285,7 @@ func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) {
defer t.tg.timerMu.Unlock()
it := t.tg.timers[id]
if it == nil {
- return 0, syserror.EINVAL
+ return 0, linuxerr.EINVAL
}
// By timer_create(2) invariant, either it.target == nil (in which case
// it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 20563f02a..079294f81 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -19,6 +19,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -481,7 +482,7 @@ func (t *Task) ptraceTraceme() error {
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
if t.hasTracer() {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if t.parent == nil {
// In Linux, only init can not have a parent, and init is assumed never
@@ -497,7 +498,7 @@ func (t *Task) ptraceTraceme() error {
return nil
}
if !t.parent.canTraceLocked(t, true) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if t.parent.exitState != TaskExitNone {
// Fail silently, as if we were successfully attached but then
@@ -513,21 +514,21 @@ func (t *Task) ptraceTraceme() error {
// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller.
func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error {
if t.tg == target.tg {
- return syserror.EPERM
+ return linuxerr.EPERM
}
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
if !t.canTraceLocked(target, true) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if target.hasTracer() {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// Attaching to zombies and dead tasks is not permitted; the exit
// notification logic relies on this. Linux allows attaching to PF_EXITING
// tasks, though.
if target.exitState >= TaskExitZombie {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if seize {
if err := target.ptraceSetOptionsLocked(opts); err != nil {
@@ -651,6 +652,7 @@ func (t *Task) forgetTracerLocked() {
// Preconditions:
// * The signal mutex must be locked.
// * The caller must be running on the task goroutine.
+// +checklocks:t.tg.signalHandlers.mu
func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool {
if linux.Signal(info.Signo) == linux.SIGKILL {
return false
@@ -766,14 +768,14 @@ const (
// ptraceClone is called at the end of a clone or fork syscall to check if t
// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK
// stop. child is the new task.
-func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool {
+func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, args *linux.CloneArgs) bool {
if !t.hasTracer() {
return false
}
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
event := false
- if !opts.Untraced {
+ if args.Flags&linux.CLONE_UNTRACED == 0 {
switch kind {
case ptraceCloneKindClone:
if t.ptraceOpts.TraceClone {
@@ -808,7 +810,7 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions
// clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is
// confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() =>
// include/linux/ptrace.h:ptrace_init_task().
- if event || opts.InheritTracer {
+ if event || args.Flags&linux.CLONE_PTRACE != 0 {
tracer := t.Tracer()
if tracer != nil {
child.ptraceTracer.Store(tracer)
@@ -910,7 +912,7 @@ func (t *Task) ptraceExit() {
return
}
t.tg.signalHandlers.mu.Lock()
- status := t.exitStatus.Status()
+ status := t.exitStatus
t.tg.signalHandlers.mu.Unlock()
t.Debugf("Entering PTRACE_EVENT_EXIT stop")
t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status))
@@ -938,7 +940,7 @@ func (t *Task) ptraceKill(target *Task) error {
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
if target.Tracer() != t {
- return syserror.ESRCH
+ return linuxerr.ESRCH
}
target.tg.signalHandlers.mu.Lock()
defer target.tg.signalHandlers.mu.Unlock()
@@ -962,7 +964,7 @@ func (t *Task) ptraceInterrupt(target *Task) error {
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
if target.Tracer() != t {
- return syserror.ESRCH
+ return linuxerr.ESRCH
}
if !target.ptraceSeized {
return syserror.EIO
@@ -994,7 +996,7 @@ func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
linux.PTRACE_O_TRACEVFORK |
linux.PTRACE_O_TRACEVFORKDONE)
if opts&^valid != 0 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
t.ptraceOpts = ptraceOptions{
ExitKill: opts&linux.PTRACE_O_EXITKILL != 0,
@@ -1020,7 +1022,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
// specified by pid.
target := t.tg.pidns.TaskWithID(pid)
if target == nil {
- return syserror.ESRCH
+ return linuxerr.ESRCH
}
// PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already
@@ -1045,7 +1047,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
t.tg.pidns.owner.mu.RLock()
if target.Tracer() != t {
t.tg.pidns.owner.mu.RUnlock()
- return syserror.ESRCH
+ return linuxerr.ESRCH
}
if !target.ptraceFreeze() {
t.tg.pidns.owner.mu.RUnlock()
@@ -1053,7 +1055,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
// PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the
// tracee to be in a ptrace-stop, otherwise they fail with ESRCH." -
// ptrace(2)
- return syserror.ESRCH
+ return linuxerr.ESRCH
}
t.tg.pidns.owner.mu.RUnlock()
// Even if the target has a ptrace-stop active, the tracee's task goroutine
@@ -1221,7 +1223,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
if target.ptraceSiginfo == nil {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
_, err := target.ptraceSiginfo.CopyOut(t, data)
return err
@@ -1234,14 +1236,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
if target.ptraceSiginfo == nil {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
target.ptraceSiginfo = &info
return nil
case linux.PTRACE_GETSIGMASK:
if addr != linux.SignalSetSize {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
mask := target.SignalMask()
_, err := mask.CopyOut(t, data)
@@ -1249,7 +1251,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
case linux.PTRACE_SETSIGMASK:
if addr != linux.SignalSetSize {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
var mask linux.SignalSet
if _, err := mask.CopyIn(t, data); err != nil {
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index 5ae05b5c3..63422e155 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+//go:build amd64
// +build amd64
package kernel
diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go
index 46dd84cbc..27514d67b 100644
--- a/pkg/sentry/kernel/ptrace_arm64.go
+++ b/pkg/sentry/kernel/ptrace_arm64.go
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+//go:build arm64
// +build arm64
package kernel
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 4bc5bca44..de352f4f2 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -18,9 +18,9 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -59,23 +59,23 @@ func (t *Task) RSeqAvailable() bool {
func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error {
if t.rseqAddr != 0 {
if t.rseqAddr != addr {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if t.rseqSignature != signature {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
- return syserror.EBUSY
+ return linuxerr.EBUSY
}
// rseq must be aligned and correctly sized.
if addr&(linux.AlignOfRSeq-1) != 0 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if length != linux.SizeOfRSeq {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
- return syserror.EFAULT
+ return linuxerr.EFAULT
}
t.rseqAddr = addr
@@ -92,7 +92,7 @@ func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error {
t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
- return syserror.EFAULT
+ return linuxerr.EFAULT
}
return nil
@@ -103,16 +103,16 @@ func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error {
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error {
if t.rseqAddr == 0 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if t.rseqAddr != addr {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if length != linux.SizeOfRSeq {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if t.rseqSignature != signature {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if err := t.rseqClearCPU(); err != nil {
@@ -152,10 +152,10 @@ func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
return nil
}
if r.CriticalSection.Start >= r.CriticalSection.End {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if r.CriticalSection.Contains(r.Restart) {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
// the application address range, for consistency with Linux.
@@ -187,7 +187,7 @@ func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error {
// unfortunate, but unlikely in a correct program.
if err := t.rseqUpdateCPU(); err != nil {
t.oldRSeqCPUAddr = 0
- return syserror.EINVAL // yes, EINVAL, not err or EFAULT
+ return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT
}
return nil
}
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index 65e5427c1..2ae08ed12 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -25,9 +25,10 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
- "//pkg/log",
+ "//pkg/errors/linuxerr",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/ipc",
"//pkg/sentry/kernel/time",
"//pkg/sync",
"//pkg/syserror",
@@ -40,10 +41,11 @@ go_test(
srcs = ["semaphore_test.go"],
library = ":semaphore",
deps = [
- "//pkg/abi/linux",
- "//pkg/context",
- "//pkg/sentry/contexttest",
- "//pkg/sentry/kernel/auth",
- "//pkg/syserror",
+ "//pkg/abi/linux", # keep
+ "//pkg/context", # keep
+ "//pkg/sentry/contexttest", # keep
+ "//pkg/sentry/kernel/auth", # keep
+ "//pkg/sentry/kernel/ipc", # keep
+ "//pkg/syserror", # keep
],
)
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 47bb66b42..8610d3fc1 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -20,9 +20,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -46,15 +47,15 @@ const (
//
// +stateify savable
type Registry struct {
- // userNS owning the ipc name this registry belongs to. Immutable.
- userNS *auth.UserNamespace
// mu protects all fields below.
- mu sync.Mutex `state:"nosave"`
- semaphores map[int32]*Set
- lastIDUsed int32
+ mu sync.Mutex `state:"nosave"`
+
+ // reg defines basic fields and operations needed for all SysV registries.
+ reg *ipc.Registry
+
// indexes maintains a mapping between a set's index in virtual array and
// its identifier.
- indexes map[int32]int32
+ indexes map[int32]ipc.ID
}
// Set represents a set of semaphores that can be operated atomically.
@@ -64,19 +65,11 @@ type Set struct {
// registry owning this sem set. Immutable.
registry *Registry
- // Id is a handle that identifies the set.
- ID int32
-
- // key is an user provided key that can be shared between processes.
- key int32
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
- // creator is the user that created the set. Immutable.
- creator fs.FileOwner
+ obj *ipc.Object
- // mu protects all fields below.
- mu sync.Mutex `state:"nosave"`
- owner fs.FileOwner
- perms fs.FilePermissions
opTime ktime.Time
changeTime ktime.Time
@@ -114,9 +107,8 @@ type waiter struct {
// NewRegistry creates a new semaphore set registry.
func NewRegistry(userNS *auth.UserNamespace) *Registry {
return &Registry{
- userNS: userNS,
- semaphores: make(map[int32]*Set),
- indexes: make(map[int32]int32),
+ reg: ipc.NewRegistry(userNS),
+ indexes: make(map[int32]ipc.ID),
}
}
@@ -125,52 +117,40 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
// a new set is always created. If create is false, it fails if a set cannot
// be found. If exclusive is true, it fails if a set with the same key already
// exists.
-func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
+func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
if nsems < 0 || nsems > semsMax {
- return nil, syserror.EINVAL
+ return nil, linuxerr.EINVAL
}
r.mu.Lock()
defer r.mu.Unlock()
if !private {
- // Look up an existing semaphore.
- if set := r.findByKey(key); set != nil {
- set.mu.Lock()
- defer set.mu.Unlock()
-
- // Check that caller can access semaphore set.
- creds := auth.CredentialsFromContext(ctx)
- if !set.checkPerms(creds, fs.PermsFromMode(mode)) {
- return nil, syserror.EACCES
- }
+ set, err := r.reg.Find(ctx, key, mode, create, exclusive)
+ if err != nil {
+ return nil, err
+ }
- // Validate parameters.
+ // Validate semaphore-specific parameters.
+ if set != nil {
+ set := set.(*Set)
if nsems > int32(set.Size()) {
- return nil, syserror.EINVAL
- }
- if create && exclusive {
- return nil, syserror.EEXIST
+ return nil, linuxerr.EINVAL
}
return set, nil
}
-
- if !create {
- // Semaphore not found and should not be created.
- return nil, syserror.ENOENT
- }
}
// Zero is only valid if an existing set is found.
if nsems == 0 {
- return nil, syserror.EINVAL
+ return nil, linuxerr.EINVAL
}
// Apply system limits.
//
- // Map semaphores and map indexes in a registry are of the same size,
- // check map semaphores only here for the system limit.
- if len(r.semaphores) >= setsMax {
+ // Map reg.objects and map indexes in a registry are of the same size,
+ // check map reg.objects only here for the system limit.
+ if r.reg.ObjectCount() >= setsMax {
return nil, syserror.ENOSPC
}
if r.totalSems() > int(semsTotalMax-nsems) {
@@ -178,9 +158,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu
}
// Finally create a new set.
- owner := fs.FileOwnerFromContext(ctx)
- perms := fs.FilePermsFromMode(mode)
- return r.newSet(ctx, key, owner, owner, perms, nsems)
+ return r.newSetLocked(ctx, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode), nsems)
}
// IPCInfo returns information about system-wide semaphore limits and parameters.
@@ -207,7 +185,7 @@ func (r *Registry) SemInfo() *linux.SemInfo {
defer r.mu.Unlock()
info := r.IPCInfo()
- info.SemUsz = uint32(len(r.semaphores))
+ info.SemUsz = uint32(r.reg.ObjectCount())
info.SemAem = uint32(r.totalSems())
return info
@@ -230,77 +208,59 @@ func (r *Registry) HighestIndex() int32 {
return highestIndex
}
-// RemoveID removes set with give 'id' from the registry and marks the set as
+// Remove removes set with give 'id' from the registry and marks the set as
// dead. All waiters will be awakened and fail.
-func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
+func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error {
r.mu.Lock()
defer r.mu.Unlock()
- set := r.semaphores[id]
- if set == nil {
- return syserror.EINVAL
- }
index, found := r.findIndexByID(id)
if !found {
- // Inconsistent state.
- panic(fmt.Sprintf("unable to find an index for ID: %d", id))
+ return linuxerr.EINVAL
}
+ delete(r.indexes, index)
- set.mu.Lock()
- defer set.mu.Unlock()
-
- // "The effective user ID of the calling process must match the creator or
- // owner of the semaphore set, or the caller must be privileged."
- if !set.checkCredentials(creds) && !set.checkCapability(creds) {
- return syserror.EACCES
- }
+ r.reg.Remove(id, creds)
- delete(r.semaphores, set.ID)
- delete(r.indexes, index)
- set.destroy()
return nil
}
-func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
+// newSetLocked creates a new Set using given fields. An error is returned if there
+// are no more available identifiers.
+//
+// Precondition: r.mu must be held.
+func (r *Registry) newSetLocked(ctx context.Context, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
set := &Set{
registry: r,
- key: key,
- owner: owner,
- creator: owner,
- perms: perms,
+ obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, perms),
changeTime: ktime.NowFromContext(ctx),
sems: make([]sem, nsems),
}
- // Find the next available ID.
- for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
- // Handle wrap around.
- if id < 0 {
- id = 0
- continue
- }
- if r.semaphores[id] == nil {
- index, found := r.findFirstAvailableIndex()
- if !found {
- panic("unable to find an available index")
- }
- r.indexes[index] = id
- r.lastIDUsed = id
- r.semaphores[id] = set
- set.ID = id
- return set, nil
- }
+ err := r.reg.Register(set)
+ if err != nil {
+ return nil, err
+ }
+
+ index, found := r.findFirstAvailableIndex()
+ if !found {
+ // See linux, ipc/sem.c:newary().
+ return nil, linuxerr.ENOSPC
}
+ r.indexes[index] = set.obj.ID
- log.Warningf("Semaphore map is full, they must be leaking")
- return nil, syserror.ENOMEM
+ return set, nil
}
// FindByID looks up a set given an ID.
-func (r *Registry) FindByID(id int32) *Set {
+func (r *Registry) FindByID(id ipc.ID) *Set {
r.mu.Lock()
defer r.mu.Unlock()
- return r.semaphores[id]
+ mech := r.reg.FindByID(id)
+ if mech == nil {
+ return nil
+ }
+ return mech.(*Set)
}
// FindByIndex looks up a set given an index.
@@ -312,19 +272,10 @@ func (r *Registry) FindByIndex(index int32) *Set {
if !present {
return nil
}
- return r.semaphores[id]
+ return r.reg.FindByID(id).(*Set)
}
-func (r *Registry) findByKey(key int32) *Set {
- for _, v := range r.semaphores {
- if v.key == key {
- return v
- }
- }
- return nil
-}
-
-func (r *Registry) findIndexByID(id int32) (int32, bool) {
+func (r *Registry) findIndexByID(id ipc.ID) (int32, bool) {
for k, v := range r.indexes {
if v == id {
return k, true
@@ -344,12 +295,36 @@ func (r *Registry) findFirstAvailableIndex() (int32, bool) {
func (r *Registry) totalSems() int {
totalSems := 0
- for _, v := range r.semaphores {
- totalSems += v.Size()
- }
+ r.reg.ForAllObjects(
+ func(o ipc.Mechanism) {
+ totalSems += o.(*Set).Size()
+ },
+ )
return totalSems
}
+// ID returns semaphore's ID.
+func (s *Set) ID() ipc.ID {
+ return s.obj.ID
+}
+
+// Object implements ipc.Mechanism.Object.
+func (s *Set) Object() *ipc.Object {
+ return s.obj
+}
+
+// Lock implements ipc.Mechanism.Lock.
+func (s *Set) Lock() {
+ s.mu.Lock()
+}
+
+// Unlock implements ipc.mechanism.Unlock.
+//
+// +checklocksignore
+func (s *Set) Unlock() {
+ s.mu.Unlock()
+}
+
func (s *Set) findSem(num int32) *sem {
if num < 0 || int(num) >= s.Size() {
return nil
@@ -369,12 +344,12 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File
// "The effective UID of the calling process must match the owner or creator
// of the semaphore set, or the caller must be privileged."
- if !s.checkCredentials(creds) && !s.checkCapability(creds) {
- return syserror.EACCES
+ if !s.obj.CheckOwnership(creds) {
+ return linuxerr.EACCES
}
- s.owner = owner
- s.perms = perms
+ s.obj.Owner = owner
+ s.obj.Perms = perms
s.changeTime = ktime.NowFromContext(ctx)
return nil
}
@@ -394,18 +369,18 @@ func (s *Set) semStat(creds *auth.Credentials, permMask fs.PermMask) (*linux.Sem
s.mu.Lock()
defer s.mu.Unlock()
- if !s.checkPerms(creds, permMask) {
- return nil, syserror.EACCES
+ if !s.obj.CheckPermissions(creds, permMask) {
+ return nil, linuxerr.EACCES
}
return &linux.SemidDS{
SemPerm: linux.IPCPerm{
- Key: uint32(s.key),
- UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
- GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
- CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
- CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
- Mode: uint16(s.perms.LinuxMode()),
+ Key: uint32(s.obj.Key),
+ UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Owner.UID)),
+ GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Owner.GID)),
+ CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Creator.UID)),
+ CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Creator.GID)),
+ Mode: uint16(s.obj.Perms.LinuxMode()),
Seq: 0, // IPC sequence not supported.
},
SemOTime: s.opTime.TimeT(),
@@ -417,20 +392,20 @@ func (s *Set) semStat(creds *auth.Credentials, permMask fs.PermMask) (*linux.Sem
// SetVal overrides a semaphore value, waking up waiters as needed.
func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
if val < 0 || val > valueMax {
- return syserror.ERANGE
+ return linuxerr.ERANGE
}
s.mu.Lock()
defer s.mu.Unlock()
// "The calling process must have alter permission on the semaphore set."
- if !s.checkPerms(creds, fs.PermMask{Write: true}) {
- return syserror.EACCES
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Write: true}) {
+ return linuxerr.EACCES
}
sem := s.findSem(num)
if sem == nil {
- return syserror.ERANGE
+ return linuxerr.ERANGE
}
// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
@@ -452,7 +427,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
for _, val := range vals {
if val > valueMax {
- return syserror.ERANGE
+ return linuxerr.ERANGE
}
}
@@ -460,8 +435,8 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
defer s.mu.Unlock()
// "The calling process must have alter permission on the semaphore set."
- if !s.checkPerms(creds, fs.PermMask{Write: true}) {
- return syserror.EACCES
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Write: true}) {
+ return linuxerr.EACCES
}
for i, val := range vals {
@@ -482,13 +457,13 @@ func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) {
defer s.mu.Unlock()
// "The calling process must have read permission on the semaphore set."
- if !s.checkPerms(creds, fs.PermMask{Read: true}) {
- return 0, syserror.EACCES
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) {
+ return 0, linuxerr.EACCES
}
sem := s.findSem(num)
if sem == nil {
- return 0, syserror.ERANGE
+ return 0, linuxerr.ERANGE
}
return sem.value, nil
}
@@ -499,8 +474,8 @@ func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) {
defer s.mu.Unlock()
// "The calling process must have read permission on the semaphore set."
- if !s.checkPerms(creds, fs.PermMask{Read: true}) {
- return nil, syserror.EACCES
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) {
+ return nil, linuxerr.EACCES
}
vals := make([]uint16, s.Size())
@@ -516,13 +491,13 @@ func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) {
defer s.mu.Unlock()
// "The calling process must have read permission on the semaphore set."
- if !s.checkPerms(creds, fs.PermMask{Read: true}) {
- return 0, syserror.EACCES
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) {
+ return 0, linuxerr.EACCES
}
sem := s.findSem(num)
if sem == nil {
- return 0, syserror.ERANGE
+ return 0, linuxerr.ERANGE
}
return sem.pid, nil
}
@@ -532,13 +507,13 @@ func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *wait
defer s.mu.Unlock()
// The calling process must have read permission on the semaphore set.
- if !s.checkPerms(creds, fs.PermMask{Read: true}) {
- return 0, syserror.EACCES
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) {
+ return 0, linuxerr.EACCES
}
sem := s.findSem(num)
if sem == nil {
- return 0, syserror.ERANGE
+ return 0, linuxerr.ERANGE
}
var cnt uint16
for w := sem.waiters.Front(); w != nil; w = w.Next() {
@@ -581,15 +556,15 @@ func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Cr
readOnly := true
for _, op := range ops {
if s.findSem(int32(op.SemNum)) == nil {
- return nil, 0, syserror.EFBIG
+ return nil, 0, linuxerr.EFBIG
}
if op.SemOp != 0 {
readOnly = false
}
}
- if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) {
- return nil, 0, syserror.EACCES
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) {
+ return nil, 0, linuxerr.EACCES
}
ch, num, err := s.executeOps(ctx, ops, pid)
@@ -624,7 +599,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch
if op.SemOp < 0 {
// Handle 'wait' operation.
if -op.SemOp > valueMax {
- return nil, 0, syserror.ERANGE
+ return nil, 0, linuxerr.ERANGE
}
if -op.SemOp > tmpVals[op.SemNum] {
// Not enough resources, must wait.
@@ -639,7 +614,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch
} else {
// op.SemOp > 0: Handle 'signal' operation.
if tmpVals[op.SemNum] > valueMax-op.SemOp {
- return nil, 0, syserror.ERANGE
+ return nil, 0, linuxerr.ERANGE
}
}
@@ -674,38 +649,10 @@ func (s *Set) AbortWait(num int32, ch chan struct{}) {
// Waiter may not be found in case it raced with wakeWaiters().
}
-func (s *Set) checkCredentials(creds *auth.Credentials) bool {
- return s.owner.UID == creds.EffectiveKUID ||
- s.owner.GID == creds.EffectiveKGID ||
- s.creator.UID == creds.EffectiveKUID ||
- s.creator.GID == creds.EffectiveKGID
-}
-
-func (s *Set) checkCapability(creds *auth.Credentials) bool {
- return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok()
-}
-
-func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool {
- // Are we owner, or in group, or other?
- p := s.perms.Other
- if s.owner.UID == creds.EffectiveKUID {
- p = s.perms.User
- } else if creds.InGroup(s.owner.GID) {
- p = s.perms.Group
- }
-
- // Are permissions satisfied without capability checks?
- if p.SupersetOf(reqPerms) {
- return true
- }
-
- return s.checkCapability(creds)
-}
-
-// destroy destroys the set.
+// Destroy implements ipc.Mechanism.Destroy.
//
// Preconditions: Caller must hold 's.mu'.
-func (s *Set) destroy() {
+func (s *Set) Destroy() {
// Notify all waiters. They will fail on the next attempt to execute
// operations and return error.
s.dead = true
diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go
index e47acefdf..2e4ab8121 100644
--- a/pkg/sentry/kernel/semaphore/semaphore_test.go
+++ b/pkg/sentry/kernel/semaphore/semaphore_test.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -55,7 +56,7 @@ func signalled(ch chan struct{}) bool {
func TestBasic(t *testing.T) {
ctx := contexttest.Context(t)
- set := &Set{ID: 123, sems: make([]sem, 1)}
+ set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)}
ops := []linux.Sembuf{
{SemOp: 1},
}
@@ -76,7 +77,7 @@ func TestBasic(t *testing.T) {
func TestWaitForZero(t *testing.T) {
ctx := contexttest.Context(t)
- set := &Set{ID: 123, sems: make([]sem, 1)}
+ set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)}
ops := []linux.Sembuf{
{SemOp: 0},
}
@@ -115,7 +116,7 @@ func TestWaitForZero(t *testing.T) {
func TestNoWait(t *testing.T) {
ctx := contexttest.Context(t)
- set := &Set{ID: 123, sems: make([]sem, 1)}
+ set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)}
ops := []linux.Sembuf{
{SemOp: 1},
}
@@ -138,11 +139,12 @@ func TestUnregister(t *testing.T) {
ctx := contexttest.Context(t)
r := NewRegistry(auth.NewRootUserNamespace())
set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true)
+
if err != nil {
t.Fatalf("FindOrCreate() failed, err: %v", err)
}
- if got := r.FindByID(set.ID); got.ID != set.ID {
- t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set)
+ if got := r.FindByID(set.obj.ID); got.obj.ID != set.obj.ID {
+ t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.obj.ID, got, set)
}
ops := []linux.Sembuf{
@@ -155,14 +157,14 @@ func TestUnregister(t *testing.T) {
}
creds := auth.CredentialsFromContext(ctx)
- if err := r.RemoveID(set.ID, creds); err != nil {
- t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err)
+ if err := r.Remove(set.obj.ID, creds); err != nil {
+ t.Fatalf("Remove(%d) failed, err: %v", set.obj.ID, err)
}
if !set.dead {
t.Fatalf("set is not dead: %+v", set)
}
- if got := r.FindByID(set.ID); got != nil {
- t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got)
+ if got := r.FindByID(set.obj.ID); got != nil {
+ t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.obj.ID, got)
}
for i, ch := range chs {
if !signalled(ch) {
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 973d708a3..f9f872522 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -16,7 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
)
// SessionID is the public identifier.
@@ -120,8 +120,9 @@ func (pg *ProcessGroup) Originator() *ThreadGroup {
// IsOrphan returns true if this process group is an orphan.
func (pg *ProcessGroup) IsOrphan() bool {
- pg.originator.TaskSet().mu.RLock()
- defer pg.originator.TaskSet().mu.RUnlock()
+ ts := pg.originator.TaskSet()
+ ts.mu.RLock()
+ defer ts.mu.RUnlock()
return pg.ancestors == 0
}
@@ -277,14 +278,14 @@ func (tg *ThreadGroup) createSession() error {
continue
}
if s.leader == tg {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if s.id == SessionID(id) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
if pg.id == ProcessGroupID(id) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
}
}
@@ -371,7 +372,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
// Check whether a process still exists or not.
if id == 0 {
- return syserror.ESRCH
+ return linuxerr.ESRCH
}
// Per above, check for a Session leader or existing group.
@@ -380,11 +381,11 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
continue
}
if s.leader == tg {
- return syserror.EPERM
+ return linuxerr.EPERM
}
for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
if pg.id == ProcessGroupID(id) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
}
}
@@ -442,17 +443,17 @@ func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID
// Lookup the ProcessGroup.
pg := pidns.processGroups[pgid]
if pg == nil {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// Disallow the join if an execve has performed, per POSIX.
if checkExec && tg.execed {
- return syserror.EACCES
+ return linuxerr.EACCES
}
// See if it's in the same session as ours.
if pg.session != tg.processGroup.session {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// Join the group; adjust children.
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index 1c3c0794f..4e8deac4c 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -28,6 +28,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/hostarch",
"//pkg/log",
"//pkg/refs",
@@ -35,6 +36,7 @@ go_library(
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/ipc",
"//pkg/sentry/kernel/time",
"//pkg/sentry/memmap",
"//pkg/sentry/pgalloc",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index a73f1bdca..2abf467d7 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -38,10 +38,12 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -50,12 +52,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-// Key represents a shm segment key. Analogous to a file name.
-type Key int32
-
-// ID represents the opaque handle for a shm segment. Analogous to an fd.
-type ID int32
-
// Registry tracks all shared memory segments in an IPC namespace. The registry
// provides the mechanisms for creating and finding segments, and reporting
// global shm parameters.
@@ -68,50 +64,51 @@ type Registry struct {
// mu protects all fields below.
mu sync.Mutex `state:"nosave"`
- // shms maps segment ids to segments.
+ // reg defines basic fields and operations needed for all SysV registries.
//
- // shms holds all referenced segments, which are removed on the last
+ // Withing reg, there are two maps, Objects and KeysToIDs.
+ //
+ // reg.objects holds all referenced segments, which are removed on the last
// DecRef. Thus, it cannot itself hold a reference on the Shm.
//
// Since removal only occurs after the last (unlocked) DecRef, there
// exists a short window during which a Shm still exists in Shm, but is
// unreferenced. Users must use TryIncRef to determine if the Shm is
// still valid.
- shms map[ID]*Shm
-
- // keysToShms maps segment keys to segments.
//
- // Shms in keysToShms are guaranteed to be referenced, as they are
+ // keysToIDs maps segment keys to IDs.
+ //
+ // Shms in keysToIDs are guaranteed to be referenced, as they are
// removed by disassociateKey before the last DecRef.
- keysToShms map[Key]*Shm
+ reg *ipc.Registry
// Sum of the sizes of all existing segments rounded up to page size, in
// units of page size.
totalPages uint64
-
- // ID assigned to the last created segment. Used to quickly find the next
- // unused ID.
- lastIDUsed ID
}
// NewRegistry creates a new shm registry.
func NewRegistry(userNS *auth.UserNamespace) *Registry {
return &Registry{
- userNS: userNS,
- shms: make(map[ID]*Shm),
- keysToShms: make(map[Key]*Shm),
+ userNS: userNS,
+ reg: ipc.NewRegistry(userNS),
}
}
// FindByID looks up a segment given an ID.
//
// FindByID returns a reference on Shm.
-func (r *Registry) FindByID(id ID) *Shm {
+func (r *Registry) FindByID(id ipc.ID) *Shm {
r.mu.Lock()
defer r.mu.Unlock()
- s := r.shms[id]
+ mech := r.reg.FindByID(id)
+ if mech == nil {
+ return nil
+ }
+ s := mech.(*Shm)
+
// Take a reference on s. If TryIncRef fails, s has reached the last
- // DecRef, but hasn't quite been removed from r.shms yet.
+ // DecRef, but hasn't quite been removed from r.reg.objects yet.
if s != nil && s.TryIncRef() {
return s
}
@@ -128,9 +125,9 @@ func (r *Registry) dissociateKey(s *Shm) {
defer r.mu.Unlock()
s.mu.Lock()
defer s.mu.Unlock()
- if s.key != linux.IPC_PRIVATE {
- delete(r.keysToShms, s.key)
- s.key = linux.IPC_PRIVATE
+ if s.obj.Key != linux.IPC_PRIVATE {
+ r.reg.DissociateKey(s.obj.Key)
+ s.obj.Key = linux.IPC_PRIVATE
}
}
@@ -138,69 +135,49 @@ func (r *Registry) dissociateKey(s *Shm) {
// analogous to open(2).
//
// FindOrCreate returns a reference on Shm.
-func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
+func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
// "A new segment was to be created and size is less than SHMMIN or
// greater than SHMMAX." - man shmget(2)
//
// Note that 'private' always implies the creation of a new segment
// whether IPC_CREAT is specified or not.
- return nil, syserror.EINVAL
+ return nil, linuxerr.EINVAL
}
r.mu.Lock()
defer r.mu.Unlock()
- if len(r.shms) >= linux.SHMMNI {
+ if r.reg.ObjectCount() >= linux.SHMMNI {
// "All possible shared memory IDs have been taken (SHMMNI) ..."
// - man shmget(2)
return nil, syserror.ENOSPC
}
if !private {
- // Look up an existing segment.
- if shm := r.keysToShms[key]; shm != nil {
- shm.mu.Lock()
- defer shm.mu.Unlock()
-
- // Check that caller can access the segment.
- if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) {
- // "The user does not have permission to access the shared
- // memory segment, and does not have the CAP_IPC_OWNER
- // capability in the user namespace that governs its IPC
- // namespace." - man shmget(2)
- return nil, syserror.EACCES
- }
+ shm, err := r.reg.Find(ctx, key, mode, create, exclusive)
+ if err != nil {
+ return nil, err
+ }
+ // Validate shm-specific parameters.
+ if shm != nil {
+ shm := shm.(*Shm)
if size > shm.size {
// "A segment for the given key exists, but size is greater than
// the size of that segment." - man shmget(2)
- return nil, syserror.EINVAL
- }
-
- if create && exclusive {
- // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a
- // shared memory segment already exists for key."
- // - man shmget(2)
- return nil, syserror.EEXIST
+ return nil, linuxerr.EINVAL
}
-
shm.IncRef()
return shm, nil
}
-
- if !create {
- // "No segment exists for the given key, and IPC_CREAT was not
- // specified." - man shmget(2)
- return nil, syserror.ENOENT
- }
}
var sizeAligned uint64
if val, ok := hostarch.Addr(size).RoundUp(); ok {
sizeAligned = uint64(val)
} else {
- return nil, syserror.EINVAL
+ return nil, linuxerr.EINVAL
}
if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL {
@@ -211,9 +188,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
}
// Need to create a new segment.
- creator := fs.FileOwnerFromContext(ctx)
- perms := fs.FilePermsFromMode(mode)
- s, err := r.newShm(ctx, pid, key, creator, perms, size)
+ s, err := r.newShmLocked(ctx, pid, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode), size)
if err != nil {
return nil, err
}
@@ -223,10 +198,10 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
return s, nil
}
-// newShm creates a new segment in the registry.
+// newShmLocked creates a new segment in the registry.
//
// Precondition: Caller must hold r.mu.
-func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) {
+func (r *Registry) newShmLocked(ctx context.Context, pid int32, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) {
mfp := pgalloc.MemoryFileProviderFromContext(ctx)
if mfp == nil {
panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
@@ -241,40 +216,21 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
shm := &Shm{
mfp: mfp,
registry: r,
- creator: creator,
size: size,
effectiveSize: effectiveSize,
+ obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, perms),
fr: fr,
- key: key,
- perms: perms,
- owner: creator,
creatorPID: pid,
changeTime: ktime.NowFromContext(ctx),
}
shm.InitRefs()
- // Find the next available ID.
- for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
- // Handle wrap around.
- if id < 0 {
- id = 0
- continue
- }
- if r.shms[id] == nil {
- r.lastIDUsed = id
-
- shm.ID = id
- r.shms[id] = shm
- r.keysToShms[key] = shm
-
- r.totalPages += effectiveSize / hostarch.PageSize
-
- return shm, nil
- }
+ if err := r.reg.Register(shm); err != nil {
+ return nil, err
}
+ r.totalPages += effectiveSize / hostarch.PageSize
- log.Warningf("Shm ids exhuasted, they may be leaking")
- return nil, syserror.ENOSPC
+ return shm, nil
}
// IPCInfo reports global parameters for sysv shared memory segments on this
@@ -296,7 +252,7 @@ func (r *Registry) ShmInfo() *linux.ShmInfo {
defer r.mu.Unlock()
return &linux.ShmInfo{
- UsedIDs: int32(r.lastIDUsed),
+ UsedIDs: int32(r.reg.LastIDUsed()),
ShmTot: r.totalPages,
ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting.
ShmSwp: 0, // No reclaim at the moment.
@@ -313,11 +269,11 @@ func (r *Registry) remove(s *Shm) {
s.mu.Lock()
defer s.mu.Unlock()
- if s.key != linux.IPC_PRIVATE {
+ if s.obj.Key != linux.IPC_PRIVATE {
panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked()))
}
- delete(r.shms, s.ID)
+ r.reg.DissociateID(s.obj.ID)
r.totalPages -= s.effectiveSize / hostarch.PageSize
}
@@ -329,13 +285,16 @@ func (r *Registry) Release(ctx context.Context) {
// the IPC namespace containing it has no more references.
toRelease := make([]*Shm, 0)
r.mu.Lock()
- for _, s := range r.keysToShms {
- s.mu.Lock()
- if !s.pendingDestruction {
- toRelease = append(toRelease, s)
- }
- s.mu.Unlock()
- }
+ r.reg.ForAllObjects(
+ func(o ipc.Mechanism) {
+ s := o.(*Shm)
+ s.mu.Lock()
+ if !s.pendingDestruction {
+ toRelease = append(toRelease, s)
+ }
+ s.mu.Unlock()
+ },
+ )
r.mu.Unlock()
for _, s := range toRelease {
@@ -373,12 +332,6 @@ type Shm struct {
// registry points to the shm registry containing this segment. Immutable.
registry *Registry
- // ID is the kernel identifier for this segment. Immutable.
- ID ID
-
- // creator is the user that created the segment. Immutable.
- creator fs.FileOwner
-
// size is the requested size of the segment at creation, in
// bytes. Immutable.
size uint64
@@ -396,14 +349,8 @@ type Shm struct {
// mu protects all fields below.
mu sync.Mutex `state:"nosave"`
- // key is the public identifier for this segment.
- key Key
-
- // perms is the access permissions for the segment.
- perms fs.FilePermissions
+ obj *ipc.Object
- // owner of this segment.
- owner fs.FileOwner
// attachTime is updated on every successful shmat.
attachTime ktime.Time
// detachTime is updated on every successful shmdt.
@@ -425,17 +372,44 @@ type Shm struct {
pendingDestruction bool
}
+// ID returns object's ID.
+func (s *Shm) ID() ipc.ID {
+ return s.obj.ID
+}
+
+// Object implements ipc.Mechanism.Object.
+func (s *Shm) Object() *ipc.Object {
+ return s.obj
+}
+
+// Destroy implements ipc.Mechanism.Destroy. No work is performed on shm.Destroy
+// because a different removal mechanism is used in shm. See Shm.MarkDestroyed.
+func (s *Shm) Destroy() {
+}
+
+// Lock implements ipc.Mechanism.Lock.
+func (s *Shm) Lock() {
+ s.mu.Lock()
+}
+
+// Unlock implements ipc.mechanism.Unlock.
+//
+// +checklocksignore
+func (s *Shm) Unlock() {
+ s.mu.Unlock()
+}
+
// Precondition: Caller must hold s.mu.
func (s *Shm) debugLocked() string {
return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}",
- s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction)
+ s.obj.ID, s.obj.Key, s.size, s.ReadRefs(), s.pendingDestruction)
}
// MappedName implements memmap.MappingIdentity.MappedName.
func (s *Shm) MappedName(ctx context.Context) string {
s.mu.Lock()
defer s.mu.Unlock()
- return fmt.Sprintf("SYSV%08d", s.key)
+ return fmt.Sprintf("SYSV%08d", s.obj.Key)
}
// DeviceID implements memmap.MappingIdentity.DeviceID.
@@ -447,7 +421,7 @@ func (s *Shm) DeviceID() uint64 {
func (s *Shm) InodeID() uint64 {
// "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use
// this. Changing this will break them." -- Linux, ipc/shm.c:newseg()
- return uint64(s.ID)
+ return uint64(s.obj.ID)
}
// DecRef drops a reference on s.
@@ -511,7 +485,7 @@ func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange
func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
var err error
if required.End > s.fr.Length() {
- err = &memmap.BusError{syserror.EFAULT}
+ err = &memmap.BusError{linuxerr.EFAULT}
}
if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 {
return []memmap.Translation{
@@ -550,7 +524,8 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts Atta
return memmap.MMapOpts{}, syserror.EIDRM
}
- if !s.checkPermissions(ctx, fs.PermMask{
+ creds := auth.CredentialsFromContext(ctx)
+ if !s.obj.CheckPermissions(creds, fs.PermMask{
Read: true,
Write: !opts.Readonly,
Execute: opts.Execute,
@@ -558,7 +533,7 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts Atta
// "The calling process does not have the required permissions for the
// requested attach type, and does not have the CAP_IPC_OWNER capability
// in the user namespace that governs its IPC namespace." - man shmat(2)
- return memmap.MMapOpts{}, syserror.EACCES
+ return memmap.MMapOpts{}, linuxerr.EACCES
}
return memmap.MMapOpts{
Length: s.size,
@@ -590,19 +565,19 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
// "The caller must have read permission on the shared memory segment."
// - man shmctl(2)
- if !s.checkPermissions(ctx, fs.PermMask{Read: true}) {
+ creds := auth.CredentialsFromContext(ctx)
+ if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) {
// "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow
// read access for shmid, and the calling process does not have the
// CAP_IPC_OWNER capability in the user namespace that governs its IPC
// namespace." - man shmctl(2)
- return nil, syserror.EACCES
+ return nil, linuxerr.EACCES
}
var mode uint16
if s.pendingDestruction {
mode |= linux.SHM_DEST
}
- creds := auth.CredentialsFromContext(ctx)
// Use the reference count as a rudimentary count of the number of
// attaches. We exclude:
@@ -619,12 +594,12 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
ds := &linux.ShmidDS{
ShmPerm: linux.IPCPerm{
- Key: uint32(s.key),
- UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
- GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
- CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
- CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
- Mode: mode | uint16(s.perms.LinuxMode()),
+ Key: uint32(s.obj.Key),
+ UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Owner.UID)),
+ GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Owner.GID)),
+ CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Creator.UID)),
+ CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Creator.GID)),
+ Mode: mode | uint16(s.obj.Perms.LinuxMode()),
Seq: 0, // IPC sequences not supported.
},
ShmSegsz: s.size,
@@ -644,24 +619,24 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
s.mu.Lock()
defer s.mu.Unlock()
- if !s.checkOwnership(ctx) {
- return syserror.EPERM
+ creds := auth.CredentialsFromContext(ctx)
+ if !s.obj.CheckOwnership(creds) {
+ return linuxerr.EPERM
}
- creds := auth.CredentialsFromContext(ctx)
uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID))
gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID))
if !uid.Ok() || !gid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
// User may only modify the lower 9 bits of the mode. All the other bits are
// always 0 for the underlying inode.
mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff)
- s.perms = fs.FilePermsFromMode(mode)
+ s.obj.Perms = fs.FilePermsFromMode(mode)
- s.owner.UID = uid
- s.owner.GID = gid
+ s.obj.Owner.UID = uid
+ s.obj.Owner.GID = gid
s.changeTime = ktime.NowFromContext(ctx)
return nil
@@ -690,40 +665,3 @@ func (s *Shm) MarkDestroyed(ctx context.Context) {
s.DecRef(ctx)
return
}
-
-// checkOwnership verifies whether a segment may be accessed by ctx as an
-// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux.
-//
-// Precondition: Caller must hold s.mu.
-func (s *Shm) checkOwnership(ctx context.Context) bool {
- creds := auth.CredentialsFromContext(ctx)
- if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID {
- return true
- }
-
- // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux
- // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented
- // for use to "override IPC ownership checks".
- return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS)
-}
-
-// checkPermissions verifies whether a segment is accessible by ctx for access
-// described by req. See ipc/util.c:ipcperms() in Linux.
-//
-// Precondition: Caller must hold s.mu.
-func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool {
- creds := auth.CredentialsFromContext(ctx)
-
- p := s.perms.Other
- if s.owner.UID == creds.EffectiveKUID {
- p = s.perms.User
- } else if creds.InGroup(s.owner.GID) {
- p = s.perms.Group
- }
- if p.SupersetOf(req) {
- return true
- }
-
- // Tasks with CAP_IPC_OWNER may bypass permission checks.
- return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS)
-}
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 76d472292..1110ecca5 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -9,6 +9,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/sentry/fs",
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/fsutil",
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index f58ec4194..47958e2d4 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -18,6 +18,7 @@ package signalfd
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -64,7 +65,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// No task context? Not valid.
- return nil, syserror.EINVAL
+ return nil, linuxerr.EINVAL
}
// name matches fs/signalfd.c:signalfd4.
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]")
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 2e3b4488a..59eeb253d 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -32,7 +33,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -232,7 +232,7 @@ type Task struct {
// exitStatus is the task's exit status.
//
// exitStatus is protected by the signal mutex.
- exitStatus ExitStatus
+ exitStatus linux.WaitStatus
// syscallRestartBlock represents a custom restart function to run in
// restart_syscall(2) to resume an interrupted syscall.
@@ -846,7 +846,7 @@ func (t *Task) OOMScoreAdj() int32 {
// value should be between -1000 and 1000 inclusive.
func (t *Task) SetOOMScoreAdj(adj int32) error {
if adj > 1000 || adj < -1000 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
atomic.StoreInt32(&t.tg.oomScoreAdj, adj)
return nil
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
index e574997f7..dd364ae50 100644
--- a/pkg/sentry/kernel/task_acct.go
+++ b/pkg/sentry/kernel/task_acct.go
@@ -18,10 +18,10 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Getitimer implements getitimer(2).
@@ -44,7 +44,7 @@ func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) {
s, _ = t.tg.itimerProfSetting.At(tm)
t.tg.signalHandlers.mu.Unlock()
default:
- return linux.ItimerVal{}, syserror.EINVAL
+ return linux.ItimerVal{}, linuxerr.EINVAL
}
val, iv := ktime.SpecFromSetting(tm, s)
return linux.ItimerVal{
@@ -105,7 +105,7 @@ func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, err
return linux.ItimerVal{}, err
}
default:
- return linux.ItimerVal{}, syserror.EINVAL
+ return linux.ItimerVal{}, linuxerr.EINVAL
}
oldval, oldiv := ktime.SpecFromSetting(tm, olds)
return linux.ItimerVal{
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index 07533d982..b2520eecf 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -163,7 +163,7 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
region.End()
t.SleepFinish(true)
// We've timed out.
- return syserror.ETIMEDOUT
+ return linuxerr.ETIMEDOUT
}
}
diff --git a/pkg/sentry/kernel/task_cgroup.go b/pkg/sentry/kernel/task_cgroup.go
index 7c138e80f..828b90014 100644
--- a/pkg/sentry/kernel/task_cgroup.go
+++ b/pkg/sentry/kernel/task_cgroup.go
@@ -20,15 +20,13 @@ import (
"sort"
"strings"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/syserror"
)
// EnterInitialCgroups moves t into an initial set of cgroups.
//
// Precondition: t isn't in any cgroups yet, t.cgs is empty.
-//
-// +checklocksignore parent.mu is conditionally acquired.
func (t *Task) EnterInitialCgroups(parent *Task) {
var inherit map[Cgroup]struct{}
if parent != nil {
@@ -67,7 +65,7 @@ func (t *Task) EnterCgroup(c Cgroup) error {
//
// TODO(b/183137098): Implement cgroup migration.
log.Warningf("Cgroup migration is not implemented")
- return syserror.EBUSY
+ return linuxerr.EBUSY
}
}
}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 405771f3f..da4b77ca2 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -20,147 +20,46 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/cleanup"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/inet"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
-// SharingOptions controls what resources are shared by a new task created by
-// Task.Clone, or an existing task affected by Task.Unshare.
-type SharingOptions struct {
- // If NewAddressSpace is true, the task should have an independent virtual
- // address space.
- NewAddressSpace bool
-
- // If NewSignalHandlers is true, the task should use an independent set of
- // signal handlers.
- NewSignalHandlers bool
-
- // If NewThreadGroup is true, the task should be the leader of its own
- // thread group. TerminationSignal is the signal that the thread group
- // will send to its parent when it exits. If NewThreadGroup is false,
- // TerminationSignal is ignored.
- NewThreadGroup bool
- TerminationSignal linux.Signal
-
- // If NewPIDNamespace is true:
- //
- // - In the context of Task.Clone, the new task should be the init task
- // (TID 1) in a new PID namespace.
- //
- // - In the context of Task.Unshare, the task should create a new PID
- // namespace, and all subsequent clones of the task should be members of
- // the new PID namespace.
- NewPIDNamespace bool
-
- // If NewUserNamespace is true, the task should have an independent user
- // namespace.
- NewUserNamespace bool
-
- // If NewNetworkNamespace is true, the task should have an independent
- // network namespace.
- NewNetworkNamespace bool
-
- // If NewFiles is true, the task should use an independent file descriptor
- // table.
- NewFiles bool
-
- // If NewFSContext is true, the task should have an independent FSContext.
- NewFSContext bool
-
- // If NewUTSNamespace is true, the task should have an independent UTS
- // namespace.
- NewUTSNamespace bool
-
- // If NewIPCNamespace is true, the task should have an independent IPC
- // namespace.
- NewIPCNamespace bool
-}
-
-// CloneOptions controls the behavior of Task.Clone.
-type CloneOptions struct {
- // SharingOptions defines the set of resources that the new task will share
- // with its parent.
- SharingOptions
-
- // Stack is the initial stack pointer of the new task. If Stack is 0, the
- // new task will start with the same stack pointer as its parent.
- Stack hostarch.Addr
-
- // If SetTLS is true, set the new task's TLS (thread-local storage)
- // descriptor to TLS. If SetTLS is false, TLS is ignored.
- SetTLS bool
- TLS hostarch.Addr
-
- // If ChildClearTID is true, when the child exits, 0 is written to the
- // address ChildTID in the child's memory, and if the write is successful a
- // futex wake on the same address is performed.
- //
- // If ChildSetTID is true, the child's thread ID (in the child's PID
- // namespace) is written to address ChildTID in the child's memory. (As in
- // Linux, failed writes are silently ignored.)
- ChildClearTID bool
- ChildSetTID bool
- ChildTID hostarch.Addr
-
- // If ParentSetTID is true, the child's thread ID (in the parent's PID
- // namespace) is written to address ParentTID in the parent's memory. (As
- // in Linux, failed writes are silently ignored.)
- //
- // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID
- // causes the child's thread ID to be written to ptid in both the parent
- // and child's memory, but this is a documentation error fixed by
- // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID").
- ParentSetTID bool
- ParentTID hostarch.Addr
-
- // If Vfork is true, place the parent in vforkStop until the cloned task
- // releases its TaskImage.
- Vfork bool
-
- // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for
- // this clone(), and do not ptrace-attach the caller's tracer to the new
- // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate).
- Untraced bool
-
- // If InheritTracer is true, ptrace-attach the caller's tracer to the new
- // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported
- // for it. If both Untraced and InheritTracer are true, no event will be
- // reported, but tracer inheritance will still occur.
- InheritTracer bool
-}
-
// Clone implements the clone(2) syscall and returns the thread ID of the new
// task in t's PID namespace. Clone may return both a non-zero thread ID and a
// non-nil error.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
-func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
+func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
// Since signal actions may refer to application signal handlers by virtual
// address, any set of signal handlers must refer to the same address
// space.
- if !opts.NewSignalHandlers && opts.NewAddressSpace {
- return 0, nil, syserror.EINVAL
+ if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
+ return 0, nil, linuxerr.EINVAL
}
// In order for the behavior of thread-group-directed signals to be sane,
// all tasks in a thread group must share signal handlers.
- if !opts.NewThreadGroup && opts.NewSignalHandlers {
- return 0, nil, syserror.EINVAL
+ if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
+ return 0, nil, linuxerr.EINVAL
}
// All tasks in a thread group must be in the same PID namespace.
- if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) {
- return 0, nil, syserror.EINVAL
+ if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
+ return 0, nil, linuxerr.EINVAL
}
// The two different ways of specifying a new PID namespace are
// incompatible.
- if opts.NewPIDNamespace && t.childPIDNamespace != nil {
- return 0, nil, syserror.EINVAL
+ if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
+ return 0, nil, linuxerr.EINVAL
}
// Thread groups and FS contexts cannot span user namespaces.
- if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) {
- return 0, nil, syserror.EINVAL
+ if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
+ return 0, nil, linuxerr.EINVAL
+ }
+ // args.ExitSignal must be a valid signal.
+ if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
+ return 0, nil, linuxerr.EINVAL
}
// Pull task registers and FPU state, a cloned task will inherit the
@@ -174,7 +73,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// user_namespaces(7)
creds := t.Credentials()
userns := creds.UserNamespace
- if opts.NewUserNamespace {
+ if args.Flags&linux.CLONE_NEWUSER != 0 {
var err error
// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
// the caller is in a chroot environment (i.e., the caller's root
@@ -182,28 +81,26 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// in which it resides)." - clone(2). Neither chroot(2) nor
// user_namespaces(7) document this.
if t.IsChrooted() {
- return 0, nil, syserror.EPERM
+ return 0, nil, linuxerr.EPERM
}
userns, err = creds.NewChildUserNamespace()
if err != nil {
return 0, nil, err
}
}
- if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
- return 0, nil, syserror.EPERM
+ if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
+ return 0, nil, linuxerr.EPERM
}
utsns := t.UTSNamespace()
- if opts.NewUTSNamespace {
+ if args.Flags&linux.CLONE_NEWUTS != 0 {
// Note that this must happen after NewUserNamespace so we get
// the new userns if there is one.
utsns = t.UTSNamespace().Clone(userns)
}
ipcns := t.IPCNamespace()
- if opts.NewIPCNamespace {
- // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
- // namespace"
+ if args.Flags&linux.CLONE_NEWIPC != 0 {
ipcns = NewIPCNamespace(userns)
} else {
ipcns.IncRef()
@@ -214,7 +111,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
defer cu.Clean()
netns := t.NetworkNamespace()
- if opts.NewNetworkNamespace {
+ if args.Flags&linux.CLONE_NEWNET != 0 {
netns = inet.NewNamespace(netns)
}
@@ -227,7 +124,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
})
}
- image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace)
+ image, err := t.image.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
if err != nil {
return 0, nil, err
}
@@ -236,17 +133,17 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
})
// clone() returns 0 in the child.
image.Arch.SetReturn(0)
- if opts.Stack != 0 {
- image.Arch.SetStack(uintptr(opts.Stack))
+ if args.Stack != 0 {
+ image.Arch.SetStack(uintptr(args.Stack))
}
- if opts.SetTLS {
- if !image.Arch.SetTLS(uintptr(opts.TLS)) {
- return 0, nil, syserror.EPERM
+ if args.Flags&linux.CLONE_SETTLS != 0 {
+ if !image.Arch.SetTLS(uintptr(args.TLS)) {
+ return 0, nil, linuxerr.EPERM
}
}
var fsContext *FSContext
- if opts.NewFSContext {
+ if args.Flags&linux.CLONE_FS == 0 {
fsContext = t.fsContext.Fork()
} else {
fsContext = t.fsContext
@@ -254,7 +151,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
var fdTable *FDTable
- if opts.NewFiles {
+ if args.Flags&linux.CLONE_FILES == 0 {
fdTable = t.fdTable.Fork(t)
} else {
fdTable = t.fdTable
@@ -264,22 +161,22 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
pidns := t.tg.pidns
if t.childPIDNamespace != nil {
pidns = t.childPIDNamespace
- } else if opts.NewPIDNamespace {
+ } else if args.Flags&linux.CLONE_NEWPID != 0 {
pidns = pidns.NewChild(userns)
}
tg := t.tg
rseqAddr := hostarch.Addr(0)
rseqSignature := uint32(0)
- if opts.NewThreadGroup {
+ if args.Flags&linux.CLONE_THREAD == 0 {
if tg.mounts != nil {
tg.mounts.IncRef()
}
sh := t.tg.signalHandlers
- if opts.NewSignalHandlers {
+ if args.Flags&linux.CLONE_SIGHAND == 0 {
sh = sh.Fork()
}
- tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+ tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj)
rseqAddr = t.rseqAddr
rseqSignature = t.rseqSignature
@@ -304,7 +201,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
RSeqSignature: rseqSignature,
ContainerID: t.ContainerID(),
}
- if opts.NewThreadGroup {
+ if args.Flags&linux.CLONE_THREAD == 0 {
cfg.Parent = t
} else {
cfg.InheritParent = t
@@ -322,7 +219,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
//
// However kernel/fork.c:copy_process() adds a limitation to this:
// "sigaltstack should be cleared when sharing the same VM".
- if opts.NewAddressSpace || opts.Vfork {
+ if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
nt.SetSignalStack(t.SignalStack())
}
@@ -347,35 +244,35 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
nt.syscallFilters.Store(copiedFilters)
}
- if opts.Vfork {
+ if args.Flags&linux.CLONE_VFORK != 0 {
nt.vforkParent = t
}
- if opts.ChildClearTID {
- nt.SetClearTID(opts.ChildTID)
+ if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
+ nt.SetClearTID(hostarch.Addr(args.ChildTID))
}
- if opts.ChildSetTID {
+ if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
ctid := nt.ThreadID()
- ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
+ ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
}
ntid := t.tg.pidns.IDOfTask(nt)
- if opts.ParentSetTID {
- ntid.CopyOut(t, opts.ParentTID)
+ if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
+ ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
}
kind := ptraceCloneKindClone
- if opts.Vfork {
+ if args.Flags&linux.CLONE_VFORK != 0 {
kind = ptraceCloneKindVfork
- } else if opts.TerminationSignal == linux.SIGCHLD {
+ } else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
kind = ptraceCloneKindFork
}
- if t.ptraceClone(kind, nt, opts) {
- if opts.Vfork {
+ if t.ptraceClone(kind, nt, args) {
+ if args.Flags&linux.CLONE_VFORK != 0 {
return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
}
return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
}
- if opts.Vfork {
+ if args.Flags&linux.CLONE_VFORK != 0 {
t.maybeBeginVforkStop(nt)
return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
}
@@ -446,39 +343,47 @@ func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
}
// Unshare changes the set of resources t shares with other tasks, as specified
-// by opts.
+// by flags.
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) Unshare(opts *SharingOptions) error {
- // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and
- // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if
- // t is the only task using its MM, which due to clone(2)'s rules imply
- // that it is also the only task using its signal handlers / in its thread
- // group, and cause EINVAL to be returned otherwise.
+func (t *Task) Unshare(flags int32) error {
+ // "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
+ // the caller is single threaded (i.e., it is not sharing its address space
+ // with another process or thread). In this case, these flags have no
+ // effect. (Note also that specifying CLONE_THREAD automatically implies
+ // CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
+ // If the process is multithreaded, then the use of these flags results in
+ // an error." - unshare(2). This is incorrect (cf.
+ // kernel/fork.c:ksys_unshare()):
+ //
+ // - CLONE_THREAD does not imply CLONE_VM.
+ //
+ // - CLONE_SIGHAND implies CLONE_THREAD.
+ //
+ // - Only CLONE_VM requires that the caller is not sharing its address
+ // space with another thread. CLONE_SIGHAND requires that the caller is not
+ // sharing its signal handlers, and CLONE_THREAD requires that the caller
+ // is the only thread in its thread group.
//
// Since we don't count the number of tasks using each address space or set
- // of signal handlers, we reject NewSignalHandlers and NewAddressSpace
- // altogether, and interpret NewThreadGroup as requiring that t be the only
- // member of its thread group. This seems to be logically coherent, in the
- // sense that clone(2) allows a task to share signal handlers and address
- // spaces with tasks in other thread groups.
- if opts.NewAddressSpace || opts.NewSignalHandlers {
- return syserror.EINVAL
+ // of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
+ if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
+ return linuxerr.EINVAL
}
creds := t.Credentials()
- if opts.NewThreadGroup {
+ if flags&linux.CLONE_THREAD != 0 {
t.tg.signalHandlers.mu.Lock()
if t.tg.tasksCount != 1 {
t.tg.signalHandlers.mu.Unlock()
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
t.tg.signalHandlers.mu.Unlock()
// This isn't racy because we're the only living task, and therefore
// the only task capable of creating new ones, in our thread group.
}
- if opts.NewUserNamespace {
+ if flags&linux.CLONE_NEWUSER != 0 {
if t.IsChrooted() {
- return syserror.EPERM
+ return linuxerr.EPERM
}
newUserNS, err := creds.NewChildUserNamespace()
if err != nil {
@@ -492,34 +397,34 @@ func (t *Task) Unshare(opts *SharingOptions) error {
creds = t.Credentials()
}
haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
- if opts.NewPIDNamespace {
+ if flags&linux.CLONE_NEWPID != 0 {
if !haveCapSysAdmin {
- return syserror.EPERM
+ return linuxerr.EPERM
}
t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
}
t.mu.Lock()
// Can't defer unlock: DecRefs must occur without holding t.mu.
- if opts.NewNetworkNamespace {
+ if flags&linux.CLONE_NEWNET != 0 {
if !haveCapSysAdmin {
t.mu.Unlock()
- return syserror.EPERM
+ return linuxerr.EPERM
}
t.netns = inet.NewNamespace(t.netns)
}
- if opts.NewUTSNamespace {
+ if flags&linux.CLONE_NEWUTS != 0 {
if !haveCapSysAdmin {
t.mu.Unlock()
- return syserror.EPERM
+ return linuxerr.EPERM
}
// Note that this must happen after NewUserNamespace, so the
// new user namespace is used if there is one.
t.utsns = t.utsns.Clone(creds.UserNamespace)
}
- if opts.NewIPCNamespace {
+ if flags&linux.CLONE_NEWIPC != 0 {
if !haveCapSysAdmin {
t.mu.Unlock()
- return syserror.EPERM
+ return linuxerr.EPERM
}
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
@@ -527,12 +432,12 @@ func (t *Task) Unshare(opts *SharingOptions) error {
t.ipcns = NewIPCNamespace(creds.UserNamespace)
}
var oldFDTable *FDTable
- if opts.NewFiles {
+ if flags&linux.CLONE_FILES != 0 {
oldFDTable = t.fdTable
t.fdTable = oldFDTable.Fork(t)
}
var oldFSContext *FSContext
- if opts.NewFSContext {
+ if flags&linux.CLONE_FS != 0 {
oldFSContext = t.fsContext
t.fsContext = oldFSContext.Fork()
}
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index d115b8783..fbfcc19e5 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -28,66 +28,14 @@ import (
"errors"
"fmt"
"strconv"
- "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
-// An ExitStatus is a value communicated from an exiting task or thread group
-// to the party that reaps it.
-//
-// +stateify savable
-type ExitStatus struct {
- // Code is the numeric value passed to the call to exit or exit_group that
- // caused the exit. If the exit was not caused by such a call, Code is 0.
- Code int
-
- // Signo is the signal that caused the exit. If the exit was not caused by
- // a signal, Signo is 0.
- Signo int
-}
-
-func (es ExitStatus) String() string {
- var b strings.Builder
- if code := es.Code; code != 0 {
- if b.Len() != 0 {
- b.WriteByte(' ')
- }
- _, _ = fmt.Fprintf(&b, "Code=%d", code)
- }
- if signal := es.Signo; signal != 0 {
- if b.Len() != 0 {
- b.WriteByte(' ')
- }
- _, _ = fmt.Fprintf(&b, "Signal=%d", signal)
- }
- return b.String()
-}
-
-// Signaled returns true if the ExitStatus indicates that the exiting task or
-// thread group was killed by a signal.
-func (es ExitStatus) Signaled() bool {
- return es.Signo != 0
-}
-
-// Status returns the numeric representation of the ExitStatus returned by e.g.
-// the wait4() system call.
-func (es ExitStatus) Status() uint32 {
- return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff)
-}
-
-// ShellExitCode returns the numeric exit code that Bash would return for an
-// exit status of es.
-func (es ExitStatus) ShellExitCode() int {
- if es.Signaled() {
- return 128 + es.Signo
- }
- return es.Code
-}
-
// TaskExitState represents a step in the task exit path.
//
// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
@@ -163,13 +111,13 @@ func (t *Task) killedLocked() bool {
return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
}
-// PrepareExit indicates an exit with status es.
+// PrepareExit indicates an exit with the given status.
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) PrepareExit(es ExitStatus) {
+func (t *Task) PrepareExit(ws linux.WaitStatus) {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
- t.exitStatus = es
+ t.exitStatus = ws
}
// PrepareGroupExit indicates a group exit with status es to t's thread group.
@@ -180,7 +128,7 @@ func (t *Task) PrepareExit(es ExitStatus) {
// ptrace.)
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) PrepareGroupExit(es ExitStatus) {
+func (t *Task) PrepareGroupExit(ws linux.WaitStatus) {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
if t.tg.exiting || t.tg.execing != nil {
@@ -198,8 +146,8 @@ func (t *Task) PrepareGroupExit(es ExitStatus) {
return
}
t.tg.exiting = true
- t.tg.exitStatus = es
- t.exitStatus = es
+ t.tg.exitStatus = ws
+ t.exitStatus = ws
for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
if sibling != t {
sibling.killLocked()
@@ -207,11 +155,11 @@ func (t *Task) PrepareGroupExit(es ExitStatus) {
}
}
-// Kill requests that all tasks in ts exit as if group exiting with status es.
+// Kill requests that all tasks in ts exit as if group exiting with status ws.
// Kill does not wait for tasks to exit.
//
// Kill has no analogue in Linux; it's provided for save/restore only.
-func (ts *TaskSet) Kill(es ExitStatus) {
+func (ts *TaskSet) Kill(ws linux.WaitStatus) {
ts.mu.Lock()
defer ts.mu.Unlock()
ts.Root.exiting = true
@@ -219,7 +167,7 @@ func (ts *TaskSet) Kill(es ExitStatus) {
t.tg.signalHandlers.mu.Lock()
if !t.tg.exiting {
t.tg.exiting = true
- t.tg.exitStatus = es
+ t.tg.exitStatus = ws
}
t.killLocked()
t.tg.signalHandlers.mu.Unlock()
@@ -730,10 +678,10 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.S
info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
if t.exitStatus.Signaled() {
info.Code = linux.CLD_KILLED
- info.SetStatus(int32(t.exitStatus.Signo))
+ info.SetStatus(int32(t.exitStatus.TerminationSignal()))
} else {
info.Code = linux.CLD_EXITED
- info.SetStatus(int32(t.exitStatus.Code))
+ info.SetStatus(int32(t.exitStatus.ExitStatus()))
}
// TODO(b/72102453): Set utime, stime.
return info
@@ -741,7 +689,7 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.S
// ExitStatus returns t's exit status, which is only guaranteed to be
// meaningful if t.ExitState() != TaskExitNone.
-func (t *Task) ExitStatus() ExitStatus {
+func (t *Task) ExitStatus() linux.WaitStatus {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
t.tg.signalHandlers.mu.Lock()
@@ -751,7 +699,7 @@ func (t *Task) ExitStatus() ExitStatus {
// ExitStatus returns the exit status that would be returned by a consuming
// wait*() on tg.
-func (tg *ThreadGroup) ExitStatus() ExitStatus {
+func (tg *ThreadGroup) ExitStatus() linux.WaitStatus {
tg.pidns.owner.mu.RLock()
defer tg.pidns.owner.mu.RUnlock()
tg.signalHandlers.mu.Lock()
@@ -762,7 +710,9 @@ func (tg *ThreadGroup) ExitStatus() ExitStatus {
return tg.leader.exitStatus
}
-// TerminationSignal returns the thread group's termination signal.
+// TerminationSignal returns the thread group's termination signal, which is
+// the signal that will be sent to its leader's parent when all threads have
+// exited.
func (tg *ThreadGroup) TerminationSignal() linux.Signal {
tg.pidns.owner.mu.RLock()
defer tg.pidns.owner.mu.RUnlock()
@@ -888,8 +838,8 @@ type WaitResult struct {
// Event is exactly one of the events defined above.
Event waiter.EventMask
- // Status is the numeric status associated with the event.
- Status uint32
+ // Status is the wait status associated with the event.
+ Status linux.WaitStatus
}
// Wait waits for an event from a thread group that is a child of t's thread
@@ -942,7 +892,7 @@ func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
if anyWaitableTasks {
return nil, ErrNoWaitableEvent
}
- return nil, syserror.ECHILD
+ return nil, linuxerr.ECHILD
}
// Preconditions: The TaskSet mutex must be locked for writing.
@@ -1042,7 +992,7 @@ func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtrace
}
pid := t.tg.pidns.tids[target]
uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
- status := target.exitStatus.Status()
+ status := target.exitStatus
if !opts.ConsumeEvent {
return &WaitResult{
Task: target,
@@ -1056,7 +1006,7 @@ func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtrace
// differ from that reported by a consuming wait; the latter will return
// the group exit code if one is available.
if target.tg.exiting {
- status = target.tg.exitStatus.Status()
+ status = target.tg.exitStatus
}
// t may be (in the thread group of) target's parent, tracer, or both. We
// don't need to check for !exitTracerAcked because tracees are detached
@@ -1122,12 +1072,11 @@ func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions)
target.tg.groupStopWaitable = false
}
return &WaitResult{
- Task: target,
- TID: pid,
- UID: uid,
- Event: EventChildGroupStop,
- // There is no name for these status constants.
- Status: (uint32(sig)&0xff)<<8 | 0x7f,
+ Task: target,
+ TID: pid,
+ UID: uid,
+ Event: EventChildGroupStop,
+ Status: linux.WaitStatusStopped(uint32(sig)),
}
}
@@ -1148,7 +1097,7 @@ func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *
TID: pid,
UID: uid,
Event: EventGroupContinue,
- Status: 0xffff,
+ Status: linux.WaitStatusContinued(),
}
}
@@ -1176,7 +1125,7 @@ func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *Wai
TID: pid,
UID: uid,
Event: EventTraceeStop,
- Status: uint32(code)<<8 | 0x7f,
+ Status: linux.WaitStatusStopped(uint32(code)),
}
}
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index 0325967e4..a9067b682 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -16,9 +16,9 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Credentials returns t's credentials.
@@ -47,7 +47,7 @@ func (t *Task) HasCapability(cp linux.Capability) bool {
func (t *Task) SetUID(uid auth.UID) error {
// setuid considers -1 to be invalid.
if !uid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
t.mu.Lock()
@@ -56,7 +56,7 @@ func (t *Task) SetUID(uid auth.UID) error {
creds := t.Credentials()
kuid := creds.UserNamespace.MapToKUID(uid)
if !kuid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
// "setuid() sets the effective user ID of the calling process. If the
// effective UID of the caller is root (more precisely: if the caller has
@@ -70,7 +70,7 @@ func (t *Task) SetUID(uid auth.UID) error {
// capability) and uid does not match the real UID or saved set-user-ID of
// the calling process."
if kuid != creds.RealKUID && kuid != creds.SavedKUID {
- return syserror.EPERM
+ return linuxerr.EPERM
}
t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID)
return nil
@@ -87,26 +87,26 @@ func (t *Task) SetREUID(r, e auth.UID) error {
if r.Ok() {
newR = creds.UserNamespace.MapToKUID(r)
if !newR.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
}
newE := creds.EffectiveKUID
if e.Ok() {
newE = creds.UserNamespace.MapToKUID(e)
if !newE.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
}
if !creds.HasCapability(linux.CAP_SETUID) {
// "Unprivileged processes may only set the effective user ID to the
// real user ID, the effective user ID, or the saved set-user-ID."
if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// "Unprivileged users may only set the real user ID to the real user
// ID or the effective user ID."
if newR != creds.RealKUID && newR != creds.EffectiveKUID {
- return syserror.EPERM
+ return linuxerr.EPERM
}
}
// "If the real user ID is set (i.e., ruid is not -1) or the effective user
@@ -223,7 +223,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
// SetGID implements the semantics of setgid(2).
func (t *Task) SetGID(gid auth.GID) error {
if !gid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
t.mu.Lock()
@@ -232,14 +232,14 @@ func (t *Task) SetGID(gid auth.GID) error {
creds := t.Credentials()
kgid := creds.UserNamespace.MapToKGID(gid)
if !kgid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if creds.HasCapability(linux.CAP_SETGID) {
t.setKGIDsUncheckedLocked(kgid, kgid, kgid)
return nil
}
if kgid != creds.RealKGID && kgid != creds.SavedKGID {
- return syserror.EPERM
+ return linuxerr.EPERM
}
t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID)
return nil
@@ -255,22 +255,22 @@ func (t *Task) SetREGID(r, e auth.GID) error {
if r.Ok() {
newR = creds.UserNamespace.MapToKGID(r)
if !newR.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
}
newE := creds.EffectiveKGID
if e.Ok() {
newE = creds.UserNamespace.MapToKGID(e)
if !newE.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
}
if !creds.HasCapability(linux.CAP_SETGID) {
if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID {
- return syserror.EPERM
+ return linuxerr.EPERM
}
if newR != creds.RealKGID && newR != creds.EffectiveKGID {
- return syserror.EPERM
+ return linuxerr.EPERM
}
}
newS := creds.SavedKGID
@@ -343,13 +343,13 @@ func (t *Task) SetExtraGIDs(gids []auth.GID) error {
defer t.mu.Unlock()
creds := t.Credentials()
if !creds.HasCapability(linux.CAP_SETGID) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
kgids := make([]auth.KGID, len(gids))
for i, gid := range gids {
kgid := creds.UserNamespace.MapToKGID(gid)
if !kgid.Ok() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
kgids[i] = kgid
}
@@ -367,25 +367,25 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili
// "Permitted: This is a limiting superset for the effective capabilities
// that the thread may assume." - capabilities(7)
if effective & ^permitted != 0 {
- return syserror.EPERM
+ return linuxerr.EPERM
}
creds := t.Credentials()
// "It is also a limiting superset for the capabilities that may be added
// to the inheritable set by a thread that does not have the CAP_SETPCAP
// capability in its effective set."
if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// "If a thread drops a capability from its permitted set, it can never
// reacquire that capability (unless it execve(2)s ..."
if permitted & ^creds.PermittedCaps != 0 {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// "... if a capability is not in the bounding set, then a thread can't add
// this capability to its inheritable set, even if it was in its permitted
// capabilities ..."
if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 {
- return syserror.EPERM
+ return linuxerr.EPERM
}
creds = creds.Fork() // The credentials object is immutable. See doc for creds.
creds.PermittedCaps = permitted
@@ -402,7 +402,7 @@ func (t *Task) DropBoundingCapability(cp linux.Capability) error {
defer t.mu.Unlock()
creds := t.Credentials()
if !creds.HasCapability(linux.CAP_SETPCAP) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
creds = creds.Fork() // The credentials object is immutable. See doc for creds.
creds.BoundingCaps &^= auth.CapabilitySetOf(cp)
@@ -422,7 +422,7 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
// in ns (by rule 3 in auth.Credentials.HasCapability).
if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
- return syserror.EPERM
+ return linuxerr.EPERM
}
creds = creds.Fork() // The credentials object is immutable. See doc for creds.
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 72b9a0384..8de08151a 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -235,7 +235,7 @@ func (t *Task) traceExitEvent() {
if !trace.IsEnabled() {
return
}
- trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status())
+ trace.Logf(t.traceContext, traceCategory, "exit status: %s", t.exitStatus)
}
// traceExecEvent is called when a task calls exec.
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 068f25af1..054ff212f 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -377,7 +377,7 @@ func (app *runApp) execute(t *Task) taskRunState {
default:
// What happened? Can't continue.
t.Warningf("Unexpected SwitchToApp error: %v", err)
- t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
+ t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1))))
return (*runExit)(nil)
}
}
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index f142feab4..9d9fa76a6 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -23,12 +23,12 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
- "gvisor.dev/gvisor/pkg/syserror"
)
// TaskGoroutineState is a coarse representation of the current execution
@@ -601,7 +601,7 @@ func (t *Task) SetCPUMask(mask sched.CPUSet) error {
// Ensure that at least 1 CPU is still allowed.
if mask.NumCPUs() == 0 {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
if t.k.useHostCores {
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 8ca61ed48..7065ac79c 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -22,6 +22,7 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/eventchannel"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -156,7 +157,8 @@ func (t *Task) PendingSignals() linux.SignalSet {
// deliverSignal delivers the given signal and returns the following run state.
func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRunState {
- sigact := computeAction(linux.Signal(info.Signo), act)
+ sig := linux.Signal(info.Signo)
+ sigact := computeAction(sig, act)
if t.haveSyscallReturn {
if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
@@ -197,14 +199,14 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu
}
// Attach an fault address if appropriate.
- switch linux.Signal(info.Signo) {
+ switch sig {
case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS:
ucs.FaultAddr = info.Addr()
}
eventchannel.Emit(ucs)
- t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)})
+ t.PrepareGroupExit(linux.WaitStatusTerminationSignal(sig))
return (*runExit)(nil)
case SignalActionStop:
@@ -224,12 +226,12 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu
// Send a forced SIGSEGV. If the signal that couldn't be delivered
// was a SIGSEGV, force the handler to SIG_DFL.
- t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */)
+ t.forceSignal(linux.SIGSEGV, sig == linux.SIGSEGV /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
}
default:
- panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act)))
+ panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(sig, act)))
}
return (*runInterrupt)(nil)
}
@@ -338,7 +340,7 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux.
}
if timeout == 0 {
- return nil, syserror.EAGAIN
+ return nil, linuxerr.EAGAIN
}
// Unblock signals we're waiting for. Remember the original signal mask so
@@ -359,8 +361,8 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux.
if info := t.dequeueSignalLocked(mask); info != nil {
return info, nil
}
- if err == syserror.ETIMEDOUT {
- return nil, syserror.EAGAIN
+ if err == linuxerr.ETIMEDOUT {
+ return nil, linuxerr.EAGAIN
}
return nil, err
}
@@ -369,9 +371,9 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux.
//
// The following errors may be returned:
//
-// syserror.ESRCH - The task has exited.
-// syserror.EINVAL - The signal is not valid.
-// syserror.EAGAIN - THe signal is realtime, and cannot be queued.
+// linuxerr.ESRCH - The task has exited.
+// linuxerr.EINVAL - The signal is not valid.
+// linuxerr.EAGAIN - THe signal is realtime, and cannot be queued.
//
func (t *Task) SendSignal(info *linux.SignalInfo) error {
t.tg.pidns.owner.mu.RLock()
@@ -406,14 +408,14 @@ func (t *Task) sendSignalLocked(info *linux.SignalInfo, group bool) error {
func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer *IntervalTimer) error {
if t.exitState == TaskExitDead {
- return syserror.ESRCH
+ return linuxerr.ESRCH
}
sig := linux.Signal(info.Signo)
if sig == 0 {
return nil
}
if !sig.IsValid() {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
// Signal side effects apply even if the signal is ultimately discarded.
@@ -450,7 +452,7 @@ func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer *
}
if !q.enqueue(info, timer) {
if sig.IsRealtime() {
- return syserror.EAGAIN
+ return linuxerr.EAGAIN
}
t.Debugf("Discarding duplicate signal %d", sig)
if timer != nil {
@@ -505,7 +507,7 @@ func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) {
// ignores tg.execing.
if !tg.exiting {
tg.exiting = true
- tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)}
+ tg.exitStatus = linux.WaitStatusTerminationSignal(linux.SIGKILL)
}
for t := tg.tasks.Front(); t != nil; t = t.Next() {
t.killLocked()
@@ -684,7 +686,7 @@ func (t *Task) SetSignalStack(alt linux.SignalStack) bool {
// to *actptr (if actptr is not nil) and returns the old signal action.
func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) (linux.SigAction, error) {
if !sig.IsValid() {
- return linux.SigAction{}, syserror.EINVAL
+ return linux.SigAction{}, linuxerr.EINVAL
}
tg.pidns.owner.mu.RLock()
@@ -695,7 +697,7 @@ func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) (
oldact := sh.actions[sig]
if actptr != nil {
if sig == linux.SIGKILL || sig == linux.SIGSTOP {
- return oldact, syserror.EINVAL
+ return oldact, linuxerr.EINVAL
}
act := *actptr
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 41fd2d471..0565059c1 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,6 +17,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -299,7 +300,7 @@ func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
// Did we do a full cycle?
if tid == ns.last {
// No tid available.
- return 0, syserror.EAGAIN
+ return 0, linuxerr.EAGAIN
}
}
}
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 1874f74e5..0586c9def 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,6 +22,7 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/errors"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal"
@@ -160,7 +161,7 @@ func (t *Task) doSyscall() taskRunState {
// ok
case linux.SECCOMP_RET_KILL_THREAD:
t.Debugf("Syscall %d: killed by seccomp", sysno)
- t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
return (*runExit)(nil)
case linux.SECCOMP_RET_TRACE:
t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
@@ -310,7 +311,7 @@ func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
case linux.SECCOMP_RET_KILL_THREAD:
t.Debugf("vsyscall %d: killed by seccomp", sysno)
- t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
return (*runExit)(nil)
default:
panic(fmt.Sprintf("Unknown seccomp result %d", r))
@@ -337,7 +338,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
// causes do_exit(SIGSYS), and changing sp is ignored.
if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
- t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
return (*runExit)(nil)
}
if sysno == ^uintptr(0) {
@@ -380,6 +381,8 @@ func ExtractErrno(err error, sysno int) int {
return 0
case unix.Errno:
return int(err)
+ case *errors.Error:
+ return int(err.Errno())
case syserror.SyscallRestartErrno:
return int(err)
case *memmap.BusError:
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index fc6d9438a..8e2c36598 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
@@ -132,7 +133,7 @@ func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) erro
case 8:
const itemLen = 16
if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok {
- return syserror.EFAULT
+ return linuxerr.EFAULT
}
b := t.CopyScratchBuffer(itemLen)
@@ -190,7 +191,7 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan
case 8:
const itemLen = 16
if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok {
- return hostarch.AddrRangeSeq{}, syserror.EFAULT
+ return hostarch.AddrRangeSeq{}, linuxerr.EFAULT
}
b := t.CopyScratchBuffer(itemLen)
@@ -202,11 +203,11 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan
base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8]))
length := hostarch.ByteOrder.Uint64(b[8:16])
if length > math.MaxInt64 {
- return hostarch.AddrRangeSeq{}, syserror.EINVAL
+ return hostarch.AddrRangeSeq{}, linuxerr.EINVAL
}
ar, ok := t.MemoryManager().CheckIORange(base, int64(length))
if !ok {
- return hostarch.AddrRangeSeq{}, syserror.EFAULT
+ return hostarch.AddrRangeSeq{}, linuxerr.EFAULT
}
if numIovecs == 1 {
@@ -252,7 +253,7 @@ func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOO
}
ar, ok := t.MemoryManager().CheckIORange(addr, int64(length))
if !ok {
- return usermem.IOSequence{}, syserror.EFAULT
+ return usermem.IOSequence{}, linuxerr.EFAULT
}
return usermem.IOSequence{
IO: t.MemoryManager(),
@@ -270,7 +271,7 @@ func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOO
// Preconditions: Same as Task.CopyInIovecs.
func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
- return usermem.IOSequence{}, syserror.EINVAL
+ return usermem.IOSequence{}, linuxerr.EINVAL
}
ars, err := t.CopyInIovecs(addr, iovcnt)
if err != nil {
@@ -312,7 +313,7 @@ func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) {
tmm := cc.t.MemoryManager()
cc.t.mu.Unlock()
if !tmm.IncUsers() {
- return nil, syserror.EFAULT
+ return nil, linuxerr.EFAULT
}
return tmm, nil
}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 4566e4c7c..2eda15303 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -19,13 +19,13 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// A ThreadGroup is a logical grouping of tasks that has widespread
@@ -143,7 +143,7 @@ type ThreadGroup struct {
//
// While exiting is false, exitStatus is protected by the signal mutex.
// When exiting becomes true, exitStatus becomes immutable.
- exitStatus ExitStatus
+ exitStatus linux.WaitStatus
// terminationSignal is the signal that this thread group's leader will
// send to its parent when it exits.
@@ -357,7 +357,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool)
// "The calling process must be a session leader and not have a
// controlling terminal already." - tty_ioctl(4)
if tg.processGroup.session.leader != tg || tg.tty != nil {
- return syserror.EINVAL
+ return linuxerr.EINVAL
}
creds := auth.CredentialsFromContext(tg.leader)
@@ -371,7 +371,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool)
if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
// Stealing requires CAP_SYS_ADMIN in the root user namespace.
if !hasAdmin || !steal {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
for othertg := range tg.pidns.owner.Root.tgids {
@@ -391,7 +391,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool)
}
if !isReadable && !hasAdmin {
- return syserror.EPERM
+ return linuxerr.EPERM
}
// Set the controlling terminal and foreground process group.
@@ -419,7 +419,7 @@ func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error {
if tg.tty == nil || tg.tty != tty {
tg.signalHandlers.mu.Unlock()
- return syserror.ENOTTY
+ return linuxerr.ENOTTY
}
// "If the process was session leader, then send SIGHUP and SIGCONT to
@@ -473,7 +473,7 @@ func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) {
// "When fd does not refer to the controlling terminal of the calling
// process, -1 is returned" - tcgetpgrp(3)
if tg.tty != tty {
- return -1, syserror.ENOTTY
+ return -1, linuxerr.ENOTTY
}
return int32(tg.processGroup.session.foreground.id), nil
@@ -496,24 +496,24 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID)
// tty must be the controlling terminal.
if tg.tty != tty {
- return -1, syserror.ENOTTY
+ return -1, linuxerr.ENOTTY
}
// pgid must be positive.
if pgid < 0 {
- return -1, syserror.EINVAL
+ return -1, linuxerr.EINVAL
}
// pg must not be empty. Empty process groups are removed from their
// pid namespaces.
pg, ok := tg.pidns.processGroups[pgid]
if !ok {
- return -1, syserror.ESRCH
+ return -1, linuxerr.ESRCH
}
// pg must be part of this process's session.
if tg.processGroup.session != pg.session {
- return -1, syserror.EPERM
+ return -1, linuxerr.EPERM
}
tg.processGroup.session.foreground.id = pgid
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 2817aa3ba..e293d9a0f 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -13,8 +13,8 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/waiter",
],
)
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 26aa34aa6..191b92811 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -22,8 +22,8 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -322,7 +322,7 @@ func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Sett
// interpreted as a time relative to now.
func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) {
if value < 0 {
- return Setting{}, syserror.EINVAL
+ return Setting{}, linuxerr.EINVAL
}
if value == 0 {
return Setting{Period: interval}, nil
@@ -338,7 +338,7 @@ func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (S
// interpreted as an absolute time.
func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) {
if value.Before(ZeroTime) {
- return Setting{}, syserror.EINVAL
+ return Setting{}, linuxerr.EINVAL
}
if value.IsZero() {
return Setting{Period: interval}, nil
diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go
index dfc3c0719..b6039505a 100644
--- a/pkg/sentry/kernel/timekeeper_test.go
+++ b/pkg/sentry/kernel/timekeeper_test.go
@@ -17,12 +17,12 @@ package kernel
import (
"testing"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
- "gvisor.dev/gvisor/pkg/syserror"
)
// mockClocks is a sentrytime.Clocks that simply returns the times in the
@@ -45,7 +45,7 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) {
case sentrytime.Realtime:
return c.realtime, nil
default:
- return 0, syserror.EINVAL
+ return 0, linuxerr.EINVAL
}
}