summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/BUILD6
-rw-r--r--pkg/sentry/kernel/cgroup.go281
-rw-r--r--pkg/sentry/kernel/eventfd/BUILD1
-rw-r--r--pkg/sentry/kernel/eventfd/eventfd.go7
-rw-r--r--pkg/sentry/kernel/futex/BUILD3
-rw-r--r--pkg/sentry/kernel/futex/futex.go48
-rw-r--r--pkg/sentry/kernel/futex/futex_test.go16
-rw-r--r--pkg/sentry/kernel/kcov.go8
-rw-r--r--pkg/sentry/kernel/kernel.go52
-rw-r--r--pkg/sentry/kernel/pipe/BUILD1
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go6
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go21
-rw-r--r--pkg/sentry/kernel/ptrace.go11
-rw-r--r--pkg/sentry/kernel/ptrace_amd64.go3
-rw-r--r--pkg/sentry/kernel/ptrace_arm64.go4
-rw-r--r--pkg/sentry/kernel/rseq.go35
-rw-r--r--pkg/sentry/kernel/seccomp.go10
-rw-r--r--pkg/sentry/kernel/shm/BUILD1
-rw-r--r--pkg/sentry/kernel/shm/shm.go30
-rw-r--r--pkg/sentry/kernel/syscalls.go8
-rw-r--r--pkg/sentry/kernel/task.go18
-rw-r--r--pkg/sentry/kernel/task_cgroup.go138
-rw-r--r--pkg/sentry/kernel/task_clone.go11
-rw-r--r--pkg/sentry/kernel/task_exit.go4
-rw-r--r--pkg/sentry/kernel/task_futex.go27
-rw-r--r--pkg/sentry/kernel/task_image.go4
-rw-r--r--pkg/sentry/kernel/task_log.go13
-rw-r--r--pkg/sentry/kernel/task_run.go8
-rw-r--r--pkg/sentry/kernel/task_signals.go16
-rw-r--r--pkg/sentry/kernel/task_start.go9
-rw-r--r--pkg/sentry/kernel/task_syscall.go12
-rw-r--r--pkg/sentry/kernel/task_usermem.go67
-rw-r--r--pkg/sentry/kernel/threads.go9
-rw-r--r--pkg/sentry/kernel/timekeeper_test.go4
-rw-r--r--pkg/sentry/kernel/vdso.go4
35 files changed, 704 insertions, 192 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index c53e3e720..a1ec6daab 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -141,6 +141,7 @@ go_library(
srcs = [
"abstract_socket_namespace.go",
"aio.go",
+ "cgroup.go",
"context.go",
"fd_table.go",
"fd_table_refs.go",
@@ -178,6 +179,7 @@ go_library(
"task.go",
"task_acct.go",
"task_block.go",
+ "task_cgroup.go",
"task_clone.go",
"task_context.go",
"task_exec.go",
@@ -226,6 +228,7 @@ go_library(
"//pkg/eventchannel",
"//pkg/fspath",
"//pkg/goid",
+ "//pkg/hostarch",
"//pkg/log",
"//pkg/marshal",
"//pkg/marshal/primitive",
@@ -240,6 +243,7 @@ go_library(
"//pkg/sentry/fs/lock",
"//pkg/sentry/fs/timerfd",
"//pkg/sentry/fsbridge",
+ "//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/fsimpl/pipefs",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/fsimpl/timerfd",
@@ -294,6 +298,7 @@ go_test(
deps = [
"//pkg/abi",
"//pkg/context",
+ "//pkg/hostarch",
"//pkg/sentry/arch",
"//pkg/sentry/contexttest",
"//pkg/sentry/fs",
@@ -305,6 +310,5 @@ go_test(
"//pkg/sentry/usage",
"//pkg/sync",
"//pkg/syserror",
- "//pkg/usermem",
],
)
diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go
new file mode 100644
index 000000000..1f1c63f37
--- /dev/null
+++ b/pkg/sentry/kernel/cgroup.go
@@ -0,0 +1,281 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+)
+
+// InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID.
+const InvalidCgroupHierarchyID uint32 = 0
+
+// CgroupControllerType is the name of a cgroup controller.
+type CgroupControllerType string
+
+// CgroupController is the common interface to cgroup controllers available to
+// the entire sentry. The controllers themselves are defined by cgroupfs.
+//
+// Callers of this interface are often unable access synchronization needed to
+// ensure returned values remain valid. Some of values returned from this
+// interface are thus snapshots in time, and may become stale. This is ok for
+// many callers like procfs.
+type CgroupController interface {
+ // Returns the type of this cgroup controller (ex "memory", "cpu"). Returned
+ // value is valid for the lifetime of the controller.
+ Type() CgroupControllerType
+
+ // Hierarchy returns the ID of the hierarchy this cgroup controller is
+ // attached to. Returned value is valid for the lifetime of the controller.
+ HierarchyID() uint32
+
+ // Filesystem returns the filesystem this controller is attached to.
+ // Returned value is valid for the lifetime of the controller.
+ Filesystem() *vfs.Filesystem
+
+ // RootCgroup returns the root cgroup for this controller. Returned value is
+ // valid for the lifetime of the controller.
+ RootCgroup() Cgroup
+
+ // NumCgroups returns the number of cgroups managed by this controller.
+ // Returned value is a snapshot in time.
+ NumCgroups() uint64
+
+ // Enabled returns whether this controller is enabled. Returned value is a
+ // snapshot in time.
+ Enabled() bool
+}
+
+// Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters
+// a cgroup, it holds a reference on the underlying dentry pointing to the
+// cgroup.
+//
+// +stateify savable
+type Cgroup struct {
+ *kernfs.Dentry
+ CgroupImpl
+}
+
+func (c *Cgroup) decRef() {
+ c.Dentry.DecRef(context.Background())
+}
+
+// Path returns the absolute path of c, relative to its hierarchy root.
+func (c *Cgroup) Path() string {
+ return c.FSLocalPath()
+}
+
+// HierarchyID returns the id of the hierarchy that contains this cgroup.
+func (c *Cgroup) HierarchyID() uint32 {
+ // Note: a cgroup is guaranteed to have at least one controller.
+ return c.Controllers()[0].HierarchyID()
+}
+
+// CgroupImpl is the common interface to cgroups.
+type CgroupImpl interface {
+ Controllers() []CgroupController
+ Enter(t *Task)
+ Leave(t *Task)
+}
+
+// hierarchy represents a cgroupfs filesystem instance, with a unique set of
+// controllers attached to it. Multiple cgroupfs mounts may reference the same
+// hierarchy.
+//
+// +stateify savable
+type hierarchy struct {
+ id uint32
+ // These are a subset of the controllers in CgroupRegistry.controllers,
+ // grouped here by hierarchy for conveninent lookup.
+ controllers map[CgroupControllerType]CgroupController
+ // fs is not owned by hierarchy. The FS is responsible for unregistering the
+ // hierarchy on destruction, which removes this association.
+ fs *vfs.Filesystem
+}
+
+func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
+ if len(ctypes) != len(h.controllers) {
+ return false
+ }
+ for _, ty := range ctypes {
+ if _, ok := h.controllers[ty]; !ok {
+ return false
+ }
+ }
+ return true
+}
+
+// CgroupRegistry tracks the active set of cgroup controllers on the system.
+//
+// +stateify savable
+type CgroupRegistry struct {
+ // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid
+ // ids are from 1 to math.MaxUint32. Must be accessed through atomic ops.
+ //
+ lastHierarchyID uint32
+
+ mu sync.Mutex `state:"nosave"`
+
+ // controllers is the set of currently known cgroup controllers on the
+ // system. Protected by mu.
+ //
+ // +checklocks:mu
+ controllers map[CgroupControllerType]CgroupController
+
+ // hierarchies is the active set of cgroup hierarchies. Protected by mu.
+ //
+ // +checklocks:mu
+ hierarchies map[uint32]hierarchy
+}
+
+func newCgroupRegistry() *CgroupRegistry {
+ return &CgroupRegistry{
+ controllers: make(map[CgroupControllerType]CgroupController),
+ hierarchies: make(map[uint32]hierarchy),
+ }
+}
+
+// nextHierarchyID returns a newly allocated, unique hierarchy ID.
+func (r *CgroupRegistry) nextHierarchyID() (uint32, error) {
+ if hid := atomic.AddUint32(&r.lastHierarchyID, 1); hid != 0 {
+ return hid, nil
+ }
+ return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow")
+}
+
+// FindHierarchy returns a cgroup filesystem containing exactly the set of
+// controllers named in names. If no such FS is found, FindHierarchy return
+// nil. FindHierarchy takes a reference on the returned FS, which is transferred
+// to the caller.
+func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Filesystem {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ for _, h := range r.hierarchies {
+ if h.match(ctypes) {
+ h.fs.IncRef()
+ return h.fs
+ }
+ }
+
+ return nil
+}
+
+// Register registers the provided set of controllers with the registry as a new
+// hierarchy. If any controller is already registered, the function returns an
+// error without modifying the registry. The hierarchy can be later referenced
+// by the returned id.
+func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if len(cs) == 0 {
+ return InvalidCgroupHierarchyID, fmt.Errorf("can't register hierarchy with no controllers")
+ }
+
+ for _, c := range cs {
+ if _, ok := r.controllers[c.Type()]; ok {
+ return InvalidCgroupHierarchyID, fmt.Errorf("controllers may only be mounted on a single hierarchy")
+ }
+ }
+
+ hid, err := r.nextHierarchyID()
+ if err != nil {
+ return hid, err
+ }
+
+ h := hierarchy{
+ id: hid,
+ controllers: make(map[CgroupControllerType]CgroupController),
+ fs: cs[0].Filesystem(),
+ }
+ for _, c := range cs {
+ n := c.Type()
+ r.controllers[n] = c
+ h.controllers[n] = c
+ }
+ r.hierarchies[hid] = h
+ return hid, nil
+}
+
+// Unregister removes a previously registered hierarchy from the registry. If
+// the controller was not previously registered, Unregister is a no-op.
+func (r *CgroupRegistry) Unregister(hid uint32) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if h, ok := r.hierarchies[hid]; ok {
+ for name, _ := range h.controllers {
+ delete(r.controllers, name)
+ }
+ delete(r.hierarchies, hid)
+ }
+}
+
+// computeInitialGroups takes a reference on each of the returned cgroups. The
+// caller takes ownership of this returned reference.
+func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ ctlSet := make(map[CgroupControllerType]CgroupController)
+ cgset := make(map[Cgroup]struct{})
+
+ // Remember controllers from the inherited cgroups set...
+ for cg, _ := range inherit {
+ cg.IncRef() // Ref transferred to caller.
+ for _, ctl := range cg.Controllers() {
+ ctlSet[ctl.Type()] = ctl
+ cgset[cg] = struct{}{}
+ }
+ }
+
+ // ... and add the root cgroups of all the missing controllers.
+ for name, ctl := range r.controllers {
+ if _, ok := ctlSet[name]; !ok {
+ cg := ctl.RootCgroup()
+ cg.IncRef() // Ref transferred to caller.
+ cgset[cg] = struct{}{}
+ }
+ }
+ return cgset
+}
+
+// GenerateProcCgroups writes the contents of /proc/cgroups to buf.
+func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) {
+ r.mu.Lock()
+ entries := make([]string, 0, len(r.controllers))
+ for _, c := range r.controllers {
+ en := 0
+ if c.Enabled() {
+ en = 1
+ }
+ entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en))
+ }
+ r.mu.Unlock()
+
+ sort.Strings(entries)
+ fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n")
+ for _, e := range entries {
+ fmt.Fprint(buf, e)
+ }
+}
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index 7ecbd29ab..564c3d42e 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -10,6 +10,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fdnotifier",
+ "//pkg/hostarch",
"//pkg/sentry/fs",
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/fsutil",
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 2aca02fd5..4466fbc9d 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -186,7 +187,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro
e.wq.Notify(waiter.WritableEvents)
var buf [8]byte
- usermem.ByteOrder.PutUint64(buf[:], val)
+ hostarch.ByteOrder.PutUint64(buf[:], val)
_, err := dst.CopyOut(ctx, buf[:])
return err
}
@@ -194,7 +195,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro
// Must be called with e.mu locked.
func (e *EventOperations) hostWrite(val uint64) error {
var buf [8]byte
- usermem.ByteOrder.PutUint64(buf[:], val)
+ hostarch.ByteOrder.PutUint64(buf[:], val)
_, err := unix.Write(e.hostfd, buf[:])
if err == unix.EWOULDBLOCK {
return syserror.ErrWouldBlock
@@ -207,7 +208,7 @@ func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) err
if _, err := src.CopyIn(ctx, buf[:]); err != nil {
return err
}
- val := usermem.ByteOrder.Uint64(buf[:])
+ val := hostarch.ByteOrder.Uint64(buf[:])
return e.Signal(val)
}
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 041e3d4ca..a75686cf3 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -37,6 +37,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/hostarch",
"//pkg/log",
"//pkg/sentry/memmap",
"//pkg/sync",
@@ -52,8 +53,8 @@ go_test(
library = ":futex",
deps = [
"//pkg/context",
+ "//pkg/hostarch",
"//pkg/sync",
- "//pkg/usermem",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index e4dcc4d40..0427cf3f4 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -20,10 +20,10 @@ package futex
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// KeyKind indicates the type of a Key.
@@ -83,8 +83,8 @@ func (k *Key) clone() Key {
}
// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
-func (k *Key) addr() usermem.Addr {
- return usermem.Addr(k.Offset)
+func (k *Key) addr() hostarch.Addr {
+ return hostarch.Addr(k.Offset)
}
// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
@@ -97,14 +97,14 @@ func (k *Key) matches(k2 *Key) bool {
type Target interface {
context.Context
- // SwapUint32 gives access to usermem.IO.SwapUint32.
- SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
+ // SwapUint32 gives access to hostarch.IO.SwapUint32.
+ SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)
- // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32.
- CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
+ // CompareAndSwap gives access to hostarch.IO.CompareAndSwapUint32.
+ CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)
- // LoadUint32 gives access to usermem.IO.LoadUint32.
- LoadUint32(addr usermem.Addr) (uint32, error)
+ // LoadUint32 gives access to hostarch.IO.LoadUint32.
+ LoadUint32(addr hostarch.Addr) (uint32, error)
// GetSharedKey returns a Key with kind KindSharedPrivate or
// KindSharedMappable corresponding to the memory mapped at address addr.
@@ -112,11 +112,11 @@ type Target interface {
// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
// reference is held on the MappingIdentity, which must be dropped by the
// caller when the Key is no longer in use.
- GetSharedKey(addr usermem.Addr) (Key, error)
+ GetSharedKey(addr hostarch.Addr) (Key, error)
}
// check performs a basic equality check on the given address.
-func check(t Target, addr usermem.Addr, val uint32) error {
+func check(t Target, addr hostarch.Addr, val uint32) error {
cur, err := t.LoadUint32(addr)
if err != nil {
return err
@@ -128,7 +128,7 @@ func check(t Target, addr usermem.Addr, val uint32) error {
}
// atomicOp performs a complex operation on the given address.
-func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) {
+func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) {
opType := (opIn >> 28) & 0xf
cmp := (opIn >> 24) & 0xf
opArg := (opIn >> 12) & 0xfff
@@ -328,7 +328,7 @@ const (
)
// getKey returns a Key representing address addr in c.
-func getKey(t Target, addr usermem.Addr, private bool) (Key, error) {
+func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) {
// Ensure the address is aligned.
// It must be a DWORD boundary.
if addr&0x3 != 0 {
@@ -341,7 +341,7 @@ func getKey(t Target, addr usermem.Addr, private bool) (Key, error) {
}
// bucketIndexForAddr returns the index into Manager.buckets for addr.
-func bucketIndexForAddr(addr usermem.Addr) uintptr {
+func bucketIndexForAddr(addr hostarch.Addr) uintptr {
// - The bottom 2 bits of addr must be 0, per getKey.
//
// - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
@@ -448,7 +448,7 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) {
// Wake wakes up to n waiters matching the bitmask on the given addr.
// The number of waiters woken is returned.
-func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) {
+func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) {
// This function is very hot; avoid defer.
k, err := getKey(t, addr, private)
if err != nil {
@@ -463,7 +463,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32
return r, nil
}
-func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
+func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
k1, err := getKey(t, addr, private)
if err != nil {
return 0, err
@@ -498,14 +498,14 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch
// Requeue wakes up to nwake waiters on the given addr, and unconditionally
// requeues up to nreq waiters on naddr.
-func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) {
+func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) {
return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
}
// RequeueCmp atomically checks that the addr contains val (via the Target),
// wakes up to nwake waiters on addr and then unconditionally requeues nreq
// waiters on naddr.
-func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
+func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
}
@@ -513,7 +513,7 @@ func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, v
// waiters unconditionally from addr1, and, based on the original value at addr2
// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
// It returns the total number of waiters woken.
-func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
+func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
k1, err := getKey(t, addr1, private)
if err != nil {
return 0, err
@@ -553,7 +553,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak
// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
// Waiter must be subsequently removed by calling WaitComplete, whether or not
// a wakeup is received on w.C.
-func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error {
+func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error {
k, err := getKey(t, addr, private)
if err != nil {
return err
@@ -631,7 +631,7 @@ func (m *Manager) WaitComplete(w *Waiter, t Target) {
// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see
// exit_robust_list()). Given we don't support robust lists, although handled
// below, it's never set.
-func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) {
+func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) {
k, err := getKey(t, addr, private)
if err != nil {
return false, err
@@ -663,7 +663,7 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri
return success, nil
}
-func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) {
+func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) {
for {
cur, err := t.LoadUint32(addr)
if err != nil {
@@ -724,7 +724,7 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3
// The address provided must contain the caller's TID. If there are waiters,
// TID of the next waiter (FIFO) is set to the given address, and the waiter
// woken up. If there are no waiters, 0 is set to the address.
-func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
+func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error {
k, err := getKey(t, addr, private)
if err != nil {
return err
@@ -738,7 +738,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool
return err
}
-func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error {
+func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error {
cur, err := t.LoadUint32(addr)
if err != nil {
return err
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index ba7f95d8a..deba44e5c 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -23,8 +23,8 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/usermem"
)
// testData implements the Target interface, and allows us to
@@ -43,23 +43,23 @@ func newTestData(size uint) testData {
}
}
-func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
+func (t testData) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) {
val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new)
return val, nil
}
-func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
+func (t testData) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) {
if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) {
return old, nil
}
return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
-func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) {
+func (t testData) LoadUint32(addr hostarch.Addr) (uint32, error) {
return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
-func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) {
+func (t testData) GetSharedKey(addr hostarch.Addr) (Key, error) {
return Key{
Kind: KindSharedMappable,
Offset: uint64(addr),
@@ -73,7 +73,7 @@ func futexKind(private bool) string {
return "shared"
}
-func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) *Waiter {
+func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) *Waiter {
w := NewWaiter()
if err := m.WaitPrepare(w, ta, addr, private, val, bitmask); err != nil {
t.Fatalf("WaitPrepare failed: %v", err)
@@ -463,12 +463,12 @@ const (
// Beyond being used as a Locker, this is a simple mechanism for
// changing the underlying values for simpler tests.
type testMutex struct {
- a usermem.Addr
+ a hostarch.Addr
d testData
m *Manager
}
-func newTestMutex(addr usermem.Addr, d testData, m *Manager) *testMutex {
+func newTestMutex(addr hostarch.Addr, d testData, m *Manager) *testMutex {
return &testMutex{a: addr, d: d, m: m}
}
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
index 4fcdfc541..4b943106b 100644
--- a/pkg/sentry/kernel/kcov.go
+++ b/pkg/sentry/kernel/kcov.go
@@ -22,13 +22,13 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/coverage"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
@@ -130,7 +130,7 @@ func (kcov *Kcov) InitTrace(size uint64) error {
// To simplify all the logic around mapping, we require that the length of the
// shared region is a multiple of the system page size.
- if (8*size)&(usermem.PageSize-1) != 0 {
+ if (8*size)&(hostarch.PageSize-1) != 0 {
return syserror.EINVAL
}
@@ -286,7 +286,7 @@ func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
}
// Get internal mappings.
- bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Read)
if err != nil {
return 0, err
}
@@ -314,7 +314,7 @@ func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
}
// Get internal mapping.
- bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Write)
if err != nil {
return 0, err
}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 43065b45a..e6e9da898 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -294,6 +294,11 @@ type Kernel struct {
// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
YAMAPtraceScope int32
+
+ // cgroupRegistry contains the set of active cgroup controllers on the
+ // system. It is controller by cgroupfs. Nil if cgroupfs is unavailable on
+ // the system.
+ cgroupRegistry *CgroupRegistry
}
// InitKernelArgs holds arguments to Init.
@@ -438,6 +443,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.socketMount = socketMount
k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)
+
+ k.cgroupRegistry = newCgroupRegistry()
}
return nil
}
@@ -1815,6 +1822,11 @@ func (k *Kernel) SocketMount() *vfs.Mount {
return k.socketMount
}
+// CgroupRegistry returns the cgroup registry.
+func (k *Kernel) CgroupRegistry() *CgroupRegistry {
+ return k.cgroupRegistry
+}
+
// Release releases resources owned by k.
//
// Precondition: This should only be called after the kernel is fully
@@ -1831,3 +1843,43 @@ func (k *Kernel) Release() {
k.timekeeper.Destroy()
k.vdso.Release(ctx)
}
+
+// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
+// hierarchy.
+//
+// Precondition: root must be a new cgroup with no tasks. This implies the
+// controllers for root are also new and currently manage no task, which in turn
+// implies the new cgroup can be populated without migrating tasks between
+// cgroups.
+func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
+ k.tasks.mu.RLock()
+ k.tasks.forEachTaskLocked(func(t *Task) {
+ if t.exitState != TaskExitNone {
+ return
+ }
+ t.mu.Lock()
+ t.enterCgroupLocked(root)
+ t.mu.Unlock()
+ })
+ k.tasks.mu.RUnlock()
+}
+
+// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
+// hierarchy with the provided id. This is intended for use during hierarchy
+// teardown, as otherwise the tasks would be orphaned w.r.t to some controllers.
+func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
+ k.tasks.mu.RLock()
+ k.tasks.forEachTaskLocked(func(t *Task) {
+ if t.exitState != TaskExitNone {
+ return
+ }
+ t.mu.Lock()
+ for cg, _ := range t.cgroups {
+ if cg.HierarchyID() == hid {
+ t.leaveCgroupLocked(cg)
+ }
+ }
+ t.mu.Unlock()
+ })
+ k.tasks.mu.RUnlock()
+}
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index beba6d97d..34c617b08 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -21,6 +21,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/amutex",
"//pkg/context",
+ "//pkg/hostarch",
"//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index d004f2357..06769931a 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -22,18 +22,18 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
const (
// MinimumPipeSize is a hard limit of the minimum size of a pipe.
// It corresponds to fs/pipe.c:pipe_min_size.
- MinimumPipeSize = usermem.PageSize
+ MinimumPipeSize = hostarch.PageSize
// MaximumPipeSize is a hard limit on the maximum size of a pipe.
// It corresponds to fs/pipe.c:pipe_max_size.
@@ -41,7 +41,7 @@ const (
// DefaultPipeSize is the system-wide default size of a pipe in bytes.
// It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS.
- DefaultPipeSize = 16 * usermem.PageSize
+ DefaultPipeSize = 16 * hostarch.PageSize
// atomicIOBytes is the maximum number of bytes that the pipe will
// guarantee atomic reads or writes atomically.
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index e524afad5..95b948edb 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -17,6 +17,7 @@ package pipe
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -274,7 +275,7 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti
}
src := usermem.IOSequence{
IO: fd,
- Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+ Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}),
}
var (
@@ -302,7 +303,7 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti
func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
dst := usermem.IOSequence{
IO: fd,
- Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+ Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}),
}
var (
@@ -328,7 +329,7 @@ func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescript
// fd.pipe.Notify(waiter.WritableEvents) after the read is completed.
//
// Preconditions: fd.pipe.mu must be locked.
-func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
n, err := fd.pipe.peekLocked(int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) {
return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs)
})
@@ -340,7 +341,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
// is completed.
//
// Preconditions: fd.pipe.mu must be locked.
-func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) {
n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) {
return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
})
@@ -350,7 +351,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte,
// ZeroOut implements usermem.IO.ZeroOut.
//
// Preconditions: fd.pipe.mu must be locked.
-func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) {
return safemem.ZeroSeq(dsts)
})
@@ -362,7 +363,7 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6
// fd.pipe.Notify(waiter.WritableEvents) after the read is completed.
//
// Preconditions: fd.pipe.mu must be locked.
-func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
return fd.pipe.peekLocked(ars.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) {
return dst.WriteFromBlocks(srcs)
})
@@ -373,25 +374,25 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst
// is completed.
//
// Preconditions: fd.pipe.mu must be locked.
-func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
return fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) {
return src.ReadToBlocks(dsts)
})
}
// SwapUint32 implements usermem.IO.SwapUint32.
-func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
// How did a pipe get passed as the virtual address space to futex(2)?
panic("VFSPipeFD.SwapUint32 called unexpectedly")
}
// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
-func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
}
// LoadUint32 implements usermem.IO.LoadUint32.
-func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) {
panic("VFSPipeFD.LoadUint32 called unexpectedly")
}
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index f5a60e749..57c7659e7 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -19,6 +19,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -1011,7 +1012,7 @@ func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
}
// Ptrace implements the ptrace system call.
-func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
+func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
// PTRACE_TRACEME ignores all other arguments.
if req == linux.PTRACE_TRACEME {
return t.ptraceTraceme()
@@ -1190,7 +1191,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length()))
}
ar.End = end
- return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
+ return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar))
case linux.PTRACE_SETREGSET:
ars, err := t.CopyInIovecs(data, 1)
@@ -1214,8 +1215,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
return err
}
t.p.FullStateChanged()
- ar.End -= usermem.Addr(n)
- return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
+ ar.End -= hostarch.Addr(n)
+ return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar))
case linux.PTRACE_GETSIGINFO:
t.tg.pidns.owner.mu.RLock()
@@ -1267,7 +1268,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
case linux.PTRACE_GETEVENTMSG:
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
- _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg)
+ _, err := primitive.CopyUint64Out(t, hostarch.Addr(data), target.ptraceEventMsg)
return err
// PEEKSIGINFO is unimplemented but seems to have no users anywhere.
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index 7aea3dcd8..5ae05b5c3 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -18,12 +18,13 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// ptraceArch implements arch-specific ptrace commands.
-func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error {
+func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error {
switch req {
case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER
n, err := target.Arch().PtracePeekUser(uintptr(addr))
diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go
index d971b96b3..46dd84cbc 100644
--- a/pkg/sentry/kernel/ptrace_arm64.go
+++ b/pkg/sentry/kernel/ptrace_arm64.go
@@ -17,11 +17,11 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// ptraceArch implements arch-specific ptrace commands.
-func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error {
+func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error {
return syserror.EIO
}
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 2a9023fdf..4bc5bca44 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -43,8 +44,8 @@ type OldRSeqCriticalRegion struct {
// application handler while its instruction pointer is in CriticalSection,
// set the instruction pointer to Restart and application register r10 (on
// amd64) to the former instruction pointer.
- CriticalSection usermem.AddrRange
- Restart usermem.Addr
+ CriticalSection hostarch.AddrRange
+ Restart hostarch.Addr
}
// RSeqAvailable returns true if t supports (old and new) restartable sequences.
@@ -55,7 +56,7 @@ func (t *Task) RSeqAvailable() bool {
// SetRSeq registers addr as this thread's rseq structure.
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
+func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error {
if t.rseqAddr != 0 {
if t.rseqAddr != addr {
return syserror.EINVAL
@@ -100,7 +101,7 @@ func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
// ClearRSeq unregisters addr as this thread's rseq structure.
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error {
+func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error {
if t.rseqAddr == 0 {
return syserror.EINVAL
}
@@ -166,7 +167,7 @@ func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
// CPU number.
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) OldRSeqCPUAddr() usermem.Addr {
+func (t *Task) OldRSeqCPUAddr() hostarch.Addr {
return t.oldRSeqCPUAddr
}
@@ -177,7 +178,7 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr {
// * t.RSeqAvailable() == true.
// * The caller must be running on the task goroutine.
// * t's AddressSpace must be active.
-func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
+func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error {
t.oldRSeqCPUAddr = addr
// Check that addr is writable.
@@ -221,7 +222,7 @@ func (t *Task) oldRSeqCopyOutCPU() error {
}
buf := t.CopyScratchBuffer(4)
- usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
+ hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
return err
}
@@ -236,8 +237,8 @@ func (t *Task) rseqCopyOutCPU() error {
buf := t.CopyScratchBuffer(8)
// CPUIDStart and CPUID are the first two fields in linux.RSeq.
- usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart
- usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
+ hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart
+ hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
// N.B. This write is not atomic, but since this occurs on the task
// goroutine then as long as userspace uses a single-instruction read
// it can't see an invalid value.
@@ -251,8 +252,8 @@ func (t *Task) rseqCopyOutCPU() error {
func (t *Task) rseqClearCPU() error {
buf := t.CopyScratchBuffer(8)
// CPUIDStart and CPUID are the first two fields in linux.RSeq.
- usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart
- usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
+ hostarch.ByteOrder.PutUint32(buf, 0) // CPUIDStart
+ hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
// N.B. This write is not atomic, but since this occurs on the task
// goroutine then as long as userspace uses a single-instruction read
// it can't see an invalid value.
@@ -305,7 +306,7 @@ func (t *Task) rseqAddrInterrupt() {
return
}
- critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf))
+ critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf))
if critAddr == 0 {
return
}
@@ -325,7 +326,7 @@ func (t *Task) rseqAddrInterrupt() {
return
}
- start := usermem.Addr(cs.Start)
+ start := hostarch.Addr(cs.Start)
critRange, ok := start.ToRange(cs.PostCommitOffset)
if !ok {
t.Debugf("Invalid start and offset in %+v", cs)
@@ -334,7 +335,7 @@ func (t *Task) rseqAddrInterrupt() {
return
}
- abort := usermem.Addr(cs.Abort)
+ abort := hostarch.Addr(cs.Abort)
if critRange.Contains(abort) {
t.Debugf("Abort in critical section in %+v", cs)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
@@ -353,7 +354,7 @@ func (t *Task) rseqAddrInterrupt() {
return
}
- sig := usermem.ByteOrder.Uint32(buf)
+ sig := hostarch.ByteOrder.Uint32(buf)
if sig != t.rseqSignature {
t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
@@ -376,7 +377,7 @@ func (t *Task) rseqAddrInterrupt() {
}
// Finally we can actually decide whether or not to restart.
- if !critRange.Contains(usermem.Addr(t.Arch().IP())) {
+ if !critRange.Contains(hostarch.Addr(t.Arch().IP())) {
return
}
@@ -386,7 +387,7 @@ func (t *Task) rseqAddrInterrupt() {
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) oldRSeqInterrupt() {
r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
- if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) {
+ if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) {
t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
t.Arch().SetIP(uintptr(r.Restart))
t.Arch().SetOldRSeqInterruptedIP(ip)
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index 8163a6132..a95e174a2 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -18,9 +18,9 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
const maxSyscallFilterInstructions = 1 << 15
@@ -35,11 +35,11 @@ func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input {
return bpf.InputBytes{
Data: buf,
// Go-marshal always uses the native byte order.
- Order: usermem.ByteOrder,
+ Order: hostarch.ByteOrder,
}
}
-func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
+func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *arch.SignalInfo {
si := &arch.SignalInfo{
Signo: int32(linux.SIGSYS),
Errno: errno,
@@ -56,7 +56,7 @@ func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalIn
// in because vsyscalls do not use the values in t.Arch().)
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction {
+func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction {
result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip))
action := result & linux.SECCOMP_RET_ACTION
switch action {
@@ -102,7 +102,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u
return action
}
-func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
+func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 {
data := linux.SeccompData{
Nr: sysno,
Arch: t.image.st.AuditNumber,
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index 073e14507..1c3c0794f 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -28,6 +28,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/hostarch",
"//pkg/log",
"//pkg/refs",
"//pkg/refsvfs2",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 92d60ba78..a73f1bdca 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -38,6 +38,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -47,7 +48,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// Key represents a shm segment key. Analogous to a file name.
@@ -197,13 +197,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
}
var sizeAligned uint64
- if val, ok := usermem.Addr(size).RoundUp(); ok {
+ if val, ok := hostarch.Addr(size).RoundUp(); ok {
sizeAligned = uint64(val)
} else {
return nil, syserror.EINVAL
}
- if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL {
+ if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL {
// "... allocating a segment of the requested size would cause the
// system to exceed the system-wide limit on shared memory (SHMALL)."
// - man shmget(2)
@@ -232,7 +232,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
}
- effectiveSize := uint64(usermem.Addr(size).MustRoundUp())
+ effectiveSize := uint64(hostarch.Addr(size).MustRoundUp())
fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous)
if err != nil {
return nil, err
@@ -267,7 +267,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
r.shms[id] = shm
r.keysToShms[key] = shm
- r.totalPages += effectiveSize / usermem.PageSize
+ r.totalPages += effectiveSize / hostarch.PageSize
return shm, nil
}
@@ -318,7 +318,7 @@ func (r *Registry) remove(s *Shm) {
}
delete(r.shms, s.ID)
- r.totalPages -= s.effectiveSize / usermem.PageSize
+ r.totalPages -= s.effectiveSize / hostarch.PageSize
}
// Release drops the self-reference of each active shm segment in the registry.
@@ -386,7 +386,7 @@ type Shm struct {
// effectiveSize of the segment, rounding up to the next page
// boundary. Immutable.
//
- // Invariant: effectiveSize must be a multiple of usermem.PageSize.
+ // Invariant: effectiveSize must be a multiple of hostarch.PageSize.
effectiveSize uint64
// fr is the offset into mfp.MemoryFile() that backs this contents of this
@@ -467,7 +467,7 @@ func (s *Shm) Msync(context.Context, memmap.MappableRange) error {
}
// AddMapping implements memmap.Mappable.AddMapping.
-func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error {
+func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error {
s.mu.Lock()
defer s.mu.Unlock()
s.attachTime = ktime.NowFromContext(ctx)
@@ -482,7 +482,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A
}
// RemoveMapping implements memmap.Mappable.RemoveMapping.
-func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
+func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) {
s.mu.Lock()
defer s.mu.Unlock()
// RemoveMapping may be called during task exit, when ctx
@@ -503,12 +503,12 @@ func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ userme
}
// CopyMapping implements memmap.Mappable.CopyMapping.
-func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error {
+func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error {
return nil
}
// Translate implements memmap.Mappable.Translate.
-func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
var err error
if required.End > s.fr.Length() {
err = &memmap.BusError{syserror.EFAULT}
@@ -519,7 +519,7 @@ func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableR
Source: source,
File: s.mfp.MemoryFile(),
Offset: s.fr.Start + source.Start,
- Perms: usermem.AnyAccess,
+ Perms: hostarch.AnyAccess,
},
}, err
}
@@ -543,7 +543,7 @@ type AttachOpts struct {
//
// Postconditions: The returned MMapOpts are valid only as long as a reference
// continues to be held on s.
-func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
+func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.pendingDestruction && s.ReadRefs() == 0 {
@@ -565,12 +565,12 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac
Offset: 0,
Addr: addr,
Fixed: opts.Remap,
- Perms: usermem.AccessType{
+ Perms: hostarch.AccessType{
Read: true,
Write: !opts.Readonly,
Execute: opts.Execute,
},
- MaxPerms: usermem.AnyAccess,
+ MaxPerms: hostarch.AnyAccess,
Mappable: s,
MappingIdentity: s,
}, nil
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 332bdb8e8..953d4310e 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -20,9 +20,9 @@ import (
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/usermem"
)
// maxSyscallNum is the highest supported syscall number.
@@ -243,7 +243,7 @@ type SyscallTable struct {
// Emulate is a collection of instruction addresses to emulate. The
// keys are addresses, and the values are system call numbers.
- Emulate map[usermem.Addr]uintptr
+ Emulate map[hostarch.Addr]uintptr
// The function to call in case of a missing system call.
Missing MissingFn
@@ -316,7 +316,7 @@ func (s *SyscallTable) Init() {
}
if s.Emulate == nil {
// Ensure non-nil emulate table.
- s.Emulate = make(map[usermem.Addr]uintptr)
+ s.Emulate = make(map[hostarch.Addr]uintptr)
}
max := s.MaxSysno() // Checked during RegisterSyscallTable.
@@ -359,7 +359,7 @@ func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
}
// LookupEmulate looks up an emulation syscall number.
-func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
+func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) {
sysno, ok := s.Emulate[addr]
return sysno, ok
}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 36141dd09..be1371855 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -33,7 +34,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -470,7 +470,7 @@ type Task struct {
// ThreadID to 0, and wake any futex waiters.
//
// cleartid is exclusive to the task goroutine.
- cleartid usermem.Addr
+ cleartid hostarch.Addr
// This is mostly a fake cpumask just for sched_set/getaffinity as we
// don't really control the affinity.
@@ -540,12 +540,12 @@ type Task struct {
// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
//
// oldRSeqCPUAddr is exclusive to the task goroutine.
- oldRSeqCPUAddr usermem.Addr
+ oldRSeqCPUAddr hostarch.Addr
// rseqAddr is a pointer to the userspace linux.RSeq structure.
//
// rseqAddr is exclusive to the task goroutine.
- rseqAddr usermem.Addr
+ rseqAddr hostarch.Addr
// rseqSignature is the signature that the rseq abort IP must be signed
// with.
@@ -575,7 +575,7 @@ type Task struct {
// robustList is a pointer to the head of the tasks's robust futex
// list.
- robustList usermem.Addr
+ robustList hostarch.Addr
// startTime is the real time at which the task started. It is set when
// a Task is created or invokes execve(2).
@@ -587,6 +587,12 @@ type Task struct {
//
// kcov is exclusive to the task goroutine.
kcov *Kcov
+
+ // cgroups is the set of cgroups this task belongs to. This may be empty if
+ // no cgroup controllers are enabled. Protected by mu.
+ //
+ // +checklocks:mu
+ cgroups map[Cgroup]struct{}
}
func (t *Task) savePtraceTracer() *Task {
@@ -652,7 +658,7 @@ func (t *Task) Kernel() *Kernel {
// SetClearTID sets t's cleartid.
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) SetClearTID(addr usermem.Addr) {
+func (t *Task) SetClearTID(addr hostarch.Addr) {
t.cleartid = addr
}
diff --git a/pkg/sentry/kernel/task_cgroup.go b/pkg/sentry/kernel/task_cgroup.go
new file mode 100644
index 000000000..25d2504fa
--- /dev/null
+++ b/pkg/sentry/kernel/task_cgroup.go
@@ -0,0 +1,138 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// EnterInitialCgroups moves t into an initial set of cgroups.
+//
+// Precondition: t isn't in any cgroups yet, t.cgs is empty.
+//
+// +checklocksignore parent.mu is conditionally acquired.
+func (t *Task) EnterInitialCgroups(parent *Task) {
+ var inherit map[Cgroup]struct{}
+ if parent != nil {
+ parent.mu.Lock()
+ defer parent.mu.Unlock()
+ inherit = parent.cgroups
+ }
+ joinSet := t.k.cgroupRegistry.computeInitialGroups(inherit)
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ // Transfer ownership of joinSet refs to the task's cgset.
+ t.cgroups = joinSet
+ for c, _ := range t.cgroups {
+ // Since t isn't in any cgroup yet, we can skip the check against
+ // existing cgroups.
+ c.Enter(t)
+ }
+}
+
+// EnterCgroup moves t into c.
+func (t *Task) EnterCgroup(c Cgroup) error {
+ newControllers := make(map[CgroupControllerType]struct{})
+ for _, ctl := range c.Controllers() {
+ newControllers[ctl.Type()] = struct{}{}
+ }
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ for oldCG, _ := range t.cgroups {
+ for _, oldCtl := range oldCG.Controllers() {
+ if _, ok := newControllers[oldCtl.Type()]; ok {
+ // Already in a cgroup with the same controller as one of the
+ // new ones. Requires migration between cgroups.
+ //
+ // TODO(b/183137098): Implement cgroup migration.
+ log.Warningf("Cgroup migration is not implemented")
+ return syserror.EBUSY
+ }
+ }
+ }
+
+ // No migration required.
+ t.enterCgroupLocked(c)
+
+ return nil
+}
+
+// +checklocks:t.mu
+func (t *Task) enterCgroupLocked(c Cgroup) {
+ c.IncRef()
+ t.cgroups[c] = struct{}{}
+ c.Enter(t)
+}
+
+// LeaveCgroups removes t out from all its cgroups.
+func (t *Task) LeaveCgroups() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ for c, _ := range t.cgroups {
+ t.leaveCgroupLocked(c)
+ }
+}
+
+// +checklocks:t.mu
+func (t *Task) leaveCgroupLocked(c Cgroup) {
+ c.Leave(t)
+ delete(t.cgroups, c)
+ c.decRef()
+}
+
+// taskCgroupEntry represents a line in /proc/<pid>/cgroup, and is used to
+// format a cgroup for display.
+type taskCgroupEntry struct {
+ hierarchyID uint32
+ controllers string
+ path string
+}
+
+// GenerateProcTaskCgroup writes the contents of /proc/<pid>/cgroup for t to buf.
+func (t *Task) GenerateProcTaskCgroup(buf *bytes.Buffer) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ cgEntries := make([]taskCgroupEntry, 0, len(t.cgroups))
+ for c, _ := range t.cgroups {
+ ctls := c.Controllers()
+ ctlNames := make([]string, 0, len(ctls))
+ for _, ctl := range ctls {
+ ctlNames = append(ctlNames, string(ctl.Type()))
+ }
+
+ cgEntries = append(cgEntries, taskCgroupEntry{
+ // Note: We're guaranteed to have at least one controller, and all
+ // controllers are guaranteed to be on the same hierarchy.
+ hierarchyID: ctls[0].HierarchyID(),
+ controllers: strings.Join(ctlNames, ","),
+ path: c.Path(),
+ })
+ }
+
+ sort.Slice(cgEntries, func(i, j int) bool { return cgEntries[i].hierarchyID > cgEntries[j].hierarchyID })
+ for _, cgE := range cgEntries {
+ fmt.Fprintf(buf, "%d:%s:%s\n", cgE.hierarchyID, cgE.controllers, cgE.path)
+ }
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index f305e69c0..405771f3f 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/cleanup"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -85,12 +86,12 @@ type CloneOptions struct {
// Stack is the initial stack pointer of the new task. If Stack is 0, the
// new task will start with the same stack pointer as its parent.
- Stack usermem.Addr
+ Stack hostarch.Addr
// If SetTLS is true, set the new task's TLS (thread-local storage)
// descriptor to TLS. If SetTLS is false, TLS is ignored.
SetTLS bool
- TLS usermem.Addr
+ TLS hostarch.Addr
// If ChildClearTID is true, when the child exits, 0 is written to the
// address ChildTID in the child's memory, and if the write is successful a
@@ -101,7 +102,7 @@ type CloneOptions struct {
// Linux, failed writes are silently ignored.)
ChildClearTID bool
ChildSetTID bool
- ChildTID usermem.Addr
+ ChildTID hostarch.Addr
// If ParentSetTID is true, the child's thread ID (in the parent's PID
// namespace) is written to address ParentTID in the parent's memory. (As
@@ -112,7 +113,7 @@ type CloneOptions struct {
// and child's memory, but this is a documentation error fixed by
// 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID").
ParentSetTID bool
- ParentTID usermem.Addr
+ ParentTID hostarch.Addr
// If Vfork is true, place the parent in vforkStop until the cloned task
// releases its TaskImage.
@@ -268,7 +269,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
tg := t.tg
- rseqAddr := usermem.Addr(0)
+ rseqAddr := hostarch.Addr(0)
rseqSignature := uint32(0)
if opts.NewThreadGroup {
if tg.mounts != nil {
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index ad59e4f60..b1af1a7ef 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -275,6 +275,10 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.fsContext.DecRef(t)
t.fdTable.DecRef(t)
+ // Detach task from all cgroups. This must happen before potentially the
+ // last ref to the cgroupfs mount is dropped below.
+ t.LeaveCgroups()
+
t.mu.Lock()
if t.mountNamespaceVFS2 != nil {
t.mountNamespaceVFS2.DecRef(t)
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index 195c7da9b..4dc41b82b 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -16,6 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/usermem"
@@ -30,33 +31,33 @@ func (t *Task) Futex() *futex.Manager {
}
// SwapUint32 implements futex.Target.SwapUint32.
-func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
+func (t *Task) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) {
return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{
AddressSpaceActive: true,
})
}
// CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32.
-func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
+func (t *Task) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) {
return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{
AddressSpaceActive: true,
})
}
// LoadUint32 implements futex.Target.LoadUint32.
-func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
+func (t *Task) LoadUint32(addr hostarch.Addr) (uint32, error) {
return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{
AddressSpaceActive: true,
})
}
// GetSharedKey implements futex.Target.GetSharedKey.
-func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
+func (t *Task) GetSharedKey(addr hostarch.Addr) (futex.Key, error) {
return t.MemoryManager().GetSharedFutexKey(t, addr)
}
// GetRobustList sets the robust futex list for the task.
-func (t *Task) GetRobustList() usermem.Addr {
+func (t *Task) GetRobustList() hostarch.Addr {
t.mu.Lock()
addr := t.robustList
t.mu.Unlock()
@@ -64,7 +65,7 @@ func (t *Task) GetRobustList() usermem.Addr {
}
// SetRobustList sets the robust futex list for the task.
-func (t *Task) SetRobustList(addr usermem.Addr) {
+func (t *Task) SetRobustList(addr hostarch.Addr) {
t.mu.Lock()
t.robustList = addr
t.mu.Unlock()
@@ -84,28 +85,28 @@ func (t *Task) exitRobustList() {
}
var rl linux.RobustListHead
- if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil {
+ if _, err := rl.CopyIn(t, hostarch.Addr(addr)); err != nil {
return
}
next := primitive.Uint64(rl.List)
done := 0
- var pendingLockAddr usermem.Addr
+ var pendingLockAddr hostarch.Addr
if rl.ListOpPending != 0 {
- pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset)
+ pendingLockAddr = hostarch.Addr(rl.ListOpPending + rl.FutexOffset)
}
// Wake up normal elements.
- for usermem.Addr(next) != addr {
+ for hostarch.Addr(next) != addr {
// We traverse to the next element of the list before we
// actually wake anything. This prevents the race where waking
// this futex causes a modification of the list.
- thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset)
+ thisLockAddr := hostarch.Addr(uint64(next) + rl.FutexOffset)
// Try to decode the next element in the list before waking the
// current futex. But don't check the error until after we've
// woken the current futex. Linux does it in this order too
- _, nextErr := next.CopyIn(t, usermem.Addr(next))
+ _, nextErr := next.CopyIn(t, hostarch.Addr(next))
// Wakeup the current futex if it's not pending.
if thisLockAddr != pendingLockAddr {
@@ -133,7 +134,7 @@ func (t *Task) exitRobustList() {
}
// wakeRobustListOne wakes a single futex from the robust list.
-func (t *Task) wakeRobustListOne(addr usermem.Addr) {
+func (t *Task) wakeRobustListOne(addr hostarch.Addr) {
// Bit 0 in address signals PI futex.
pi := addr&1 == 1
addr = addr &^ 1
diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go
index ce5fbd299..bd5543d4e 100644
--- a/pkg/sentry/kernel/task_image.go
+++ b/pkg/sentry/kernel/task_image.go
@@ -19,12 +19,12 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/loader"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserr"
- "gvisor.dev/gvisor/pkg/usermem"
)
var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC)
@@ -129,7 +129,7 @@ func (t *Task) Stack() *arch.Stack {
return &arch.Stack{
Arch: t.Arch(),
IO: t.MemoryManager(),
- Bottom: usermem.Addr(t.Arch().Stack()),
+ Bottom: hostarch.Addr(t.Arch().Stack()),
}
}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index c70e5e6ce..72b9a0384 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -20,6 +20,7 @@ import (
"sort"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -108,9 +109,9 @@ func (t *Task) debugDumpStack() {
return
}
t.Debugf("Stack:")
- start := usermem.Addr(t.Arch().Stack())
+ start := hostarch.Addr(t.Arch().Stack())
// Round addr down to a 16-byte boundary.
- start &= ^usermem.Addr(15)
+ start &= ^hostarch.Addr(15)
// Print 16 bytes per line, one byte at a time.
for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 {
addr, ok := start.AddLength(offset)
@@ -127,7 +128,7 @@ func (t *Task) debugDumpStack() {
t.Debugf("%x: % x", addr, data[:n])
}
if err != nil {
- t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err)
+ t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err)
break
}
}
@@ -147,9 +148,9 @@ func (t *Task) debugDumpCode() {
}
t.Debugf("Code:")
// Print code on both sides of the instruction register.
- start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2
+ start := hostarch.Addr(t.Arch().IP()) - maxCodeDebugBytes/2
// Round addr down to a 16-byte boundary.
- start &= ^usermem.Addr(15)
+ start &= ^hostarch.Addr(15)
// Print 16 bytes per line, one byte at a time.
for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 {
addr, ok := start.AddLength(offset)
@@ -166,7 +167,7 @@ func (t *Task) debugDumpCode() {
t.Debugf("%x: % x", addr, data[:n])
}
if err != nil {
- t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err)
+ t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err)
break
}
}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 3ccecf4b6..068f25af1 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -23,13 +23,13 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/goid"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// A taskRunState is a reified state in the task state machine. See README.md
@@ -148,7 +148,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error {
region := trace.StartRegion(t.traceContext, cpuidRegion)
expected := arch.CPUIDInstruction[:]
found := make([]byte, len(expected))
- _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
+ _, err := t.CopyInBytes(hostarch.Addr(t.Arch().IP()), found)
if err == nil && bytes.Equal(expected, found) {
// Skip the cpuid instruction.
t.Arch().CPUIDEmulate(t)
@@ -307,8 +307,8 @@ func (app *runApp) execute(t *Task) taskRunState {
// normally.
if at.Any() {
region := trace.StartRegion(t.traceContext, faultRegion)
- addr := usermem.Addr(info.Addr())
- err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+ addr := hostarch.Addr(info.Addr())
+ err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack()))
region.End()
if err == nil {
// The fault was handled appropriately.
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 75af3af79..c2b9fc08f 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -23,11 +23,11 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/eventchannel"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -243,7 +243,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
// Are executing on the main stack,
// or the provided alternate stack?
- sp := usermem.Addr(t.Arch().Stack())
+ sp := hostarch.Addr(t.Arch().Stack())
// N.B. This is a *copy* of the alternate stack that the user's signal
// handler expects to see in its ucontext (even if it's not in use).
@@ -251,7 +251,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
if act.IsOnStack() && alt.IsEnabled() {
alt.SetOnStack()
if !alt.Contains(sp) {
- sp = usermem.Addr(alt.Top())
+ sp = hostarch.Addr(alt.Top())
}
}
@@ -652,7 +652,7 @@ func (t *Task) SignalStack() arch.SignalStack {
// onSignalStack returns true if the task is executing on the given signal stack.
func (t *Task) onSignalStack(alt arch.SignalStack) bool {
- sp := usermem.Addr(t.Arch().Stack())
+ sp := hostarch.Addr(t.Arch().Stack())
return alt.Contains(sp)
}
@@ -720,7 +720,7 @@ func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (a
// CopyOutSignalAct converts the given SignalAct into an architecture-specific
// type and then copies it out to task memory.
-func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
+func (t *Task) CopyOutSignalAct(addr hostarch.Addr, s *arch.SignalAct) error {
n := t.Arch().NewSignalAct()
n.SerializeFrom(s)
_, err := n.CopyOut(t, addr)
@@ -729,7 +729,7 @@ func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
// CopyInSignalAct copies an architecture-specific sigaction type from task
// memory and then converts it into a SignalAct.
-func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
+func (t *Task) CopyInSignalAct(addr hostarch.Addr) (arch.SignalAct, error) {
n := t.Arch().NewSignalAct()
var s arch.SignalAct
if _, err := n.CopyIn(t, addr); err != nil {
@@ -741,7 +741,7 @@ func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
// CopyOutSignalStack converts the given SignalStack into an
// architecture-specific type and then copies it out to task memory.
-func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error {
+func (t *Task) CopyOutSignalStack(addr hostarch.Addr, s *arch.SignalStack) error {
n := t.Arch().NewSignalStack()
n.SerializeFrom(s)
_, err := n.CopyOut(t, addr)
@@ -750,7 +750,7 @@ func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error
// CopyInSignalStack copies an architecture-specific stack_t from task memory
// and then converts it into a SignalStack.
-func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) {
+func (t *Task) CopyInSignalStack(addr hostarch.Addr) (arch.SignalStack, error) {
n := t.Arch().NewSignalStack()
var s arch.SignalStack
if _, err := n.CopyIn(t, addr); err != nil {
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 36e1384f1..32031cd70 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,6 +17,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -25,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// TaskConfig defines the configuration of a new Task (see below).
@@ -86,7 +86,7 @@ type TaskConfig struct {
MountNamespaceVFS2 *vfs.MountNamespace
// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
- RSeqAddr usermem.Addr
+ RSeqAddr hostarch.Addr
// RSeqSignature is the signature that the rseq abort IP must be signed
// with.
@@ -151,6 +151,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
rseqSignature: cfg.RSeqSignature,
futexWaiter: futex.NewWaiter(),
containerID: cfg.ContainerID,
+ cgroups: make(map[Cgroup]struct{}),
}
t.creds.Store(cfg.Credentials)
t.endStopCond.L = &t.tg.signalHandlers.mu
@@ -189,6 +190,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
t.parent.children[t] = struct{}{}
}
+ if VFS2Enabled {
+ t.EnterInitialCgroups(t.parent)
+ }
+
if tg.leader == nil {
// New thread group.
tg.leader = t
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 2e84bd88a..2c658d001 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,12 +22,12 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
@@ -153,7 +153,7 @@ func (t *Task) doSyscall() taskRunState {
// Check seccomp filters. The nil check is for performance (as seccomp use
// is rare), not needed for correctness.
if t.syscallFilters.Load() != nil {
- switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r {
+ switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
t.Debugf("Syscall %d: denied by seccomp", sysno)
return (*runSyscallExit)(nil)
@@ -283,12 +283,12 @@ func (*runSyscallExit) execute(t *Task) taskRunState {
// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
// indicated by an execution fault at address addr. doVsyscall returns the
// task's next run state.
-func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
+func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
vsyscallCount.Increment()
// Grab the caller up front, to make sure there's a sensible stack.
caller := t.Arch().Native(uintptr(0))
- if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil {
+ if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
@@ -322,7 +322,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
}
type runVsyscallAfterPtraceEventSeccomp struct {
- addr usermem.Addr
+ addr hostarch.Addr
sysno uintptr
caller marshal.Marshallable
}
@@ -337,7 +337,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
// causes do_exit(SIGSYS), and changing sp is ignored.
- if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr {
+ if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
return (*runExit)(nil)
}
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 94dabbcd8..fc6d9438a 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -27,7 +28,7 @@ import (
// MAX_RW_COUNT is the maximum size in bytes of a single read or write.
// Reads and writes that exceed this size may be silently truncated.
// (Linux: include/linux/fs.h:MAX_RW_COUNT)
-var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
+var MAX_RW_COUNT = int(hostarch.Addr(math.MaxInt32).RoundDown())
// Activate ensures that the task has an active address space.
func (t *Task) Activate() {
@@ -49,7 +50,7 @@ func (t *Task) Deactivate() {
// data without reflection and pass in a byte slice.
//
// This Task's AddressSpace must be active.
-func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+func (t *Task) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) {
return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{
AddressSpaceActive: true,
})
@@ -59,7 +60,7 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
// data without reflection and pass in a byte slice.
//
// This Task's AddressSpace must be active.
-func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+func (t *Task) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) {
return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{
AddressSpaceActive: true,
})
@@ -70,7 +71,7 @@ func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
// user memory that is unmapped or not readable by the user.
//
// This Task's AddressSpace must be active.
-func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) {
+func (t *Task) CopyInString(addr hostarch.Addr, maxlen int) (string, error) {
return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{
AddressSpaceActive: true,
})
@@ -90,7 +91,7 @@ func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) {
// { "abc" } => 4 (3 for length, 1 for elements)
//
// This Task's AddressSpace must be active.
-func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) {
+func (t *Task) CopyInVector(addr hostarch.Addr, maxElemSize, maxTotalSize int) ([]string, error) {
var v []string
for {
argAddr := t.Arch().Native(0)
@@ -109,12 +110,12 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
if maxTotalSize < thisMax {
thisMax = maxTotalSize
}
- arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax)
+ arg, err := t.CopyInString(hostarch.Addr(t.Arch().Value(argAddr)), thisMax)
if err != nil {
return v, err
}
v = append(v, arg)
- addr += usermem.Addr(t.Arch().Width())
+ addr += hostarch.Addr(t.Arch().Width())
maxTotalSize -= len(arg) + 1
}
return v, nil
@@ -126,7 +127,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
// Preconditions: Same as usermem.IO.CopyOut, plus:
// * The caller must be running on the task goroutine.
// * t's AddressSpace must be active.
-func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
+func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) error {
switch t.Arch().Width() {
case 8:
const itemLen = 16
@@ -137,8 +138,8 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
b := t.CopyScratchBuffer(itemLen)
for ; !src.IsEmpty(); src = src.Tail() {
ar := src.Head()
- usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start))
- usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length()))
+ hostarch.ByteOrder.PutUint64(b[0:8], uint64(ar.Start))
+ hostarch.ByteOrder.PutUint64(b[8:16], uint64(ar.Length()))
if _, err := t.CopyOutBytes(addr, b); err != nil {
return err
}
@@ -153,8 +154,8 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
}
// CopyInIovecs copies an array of numIovecs struct iovecs from the memory
-// mapped at addr, converts them to usermem.AddrRanges, and returns them as a
-// usermem.AddrRangeSeq.
+// mapped at addr, converts them to hostarch.AddrRanges, and returns them as a
+// hostarch.AddrRangeSeq.
//
// CopyInIovecs shares the following properties with Linux's
// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector():
@@ -175,42 +176,42 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
// Preconditions: Same as usermem.IO.CopyIn, plus:
// * The caller must be running on the task goroutine.
// * t's AddressSpace must be active.
-func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
+func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRangeSeq, error) {
if numIovecs == 0 {
- return usermem.AddrRangeSeq{}, nil
+ return hostarch.AddrRangeSeq{}, nil
}
- var dst []usermem.AddrRange
+ var dst []hostarch.AddrRange
if numIovecs > 1 {
- dst = make([]usermem.AddrRange, 0, numIovecs)
+ dst = make([]hostarch.AddrRange, 0, numIovecs)
}
switch t.Arch().Width() {
case 8:
const itemLen = 16
if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok {
- return usermem.AddrRangeSeq{}, syserror.EFAULT
+ return hostarch.AddrRangeSeq{}, syserror.EFAULT
}
b := t.CopyScratchBuffer(itemLen)
for i := 0; i < numIovecs; i++ {
if _, err := t.CopyInBytes(addr, b); err != nil {
- return usermem.AddrRangeSeq{}, err
+ return hostarch.AddrRangeSeq{}, err
}
- base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8]))
- length := usermem.ByteOrder.Uint64(b[8:16])
+ base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8]))
+ length := hostarch.ByteOrder.Uint64(b[8:16])
if length > math.MaxInt64 {
- return usermem.AddrRangeSeq{}, syserror.EINVAL
+ return hostarch.AddrRangeSeq{}, syserror.EINVAL
}
ar, ok := t.MemoryManager().CheckIORange(base, int64(length))
if !ok {
- return usermem.AddrRangeSeq{}, syserror.EFAULT
+ return hostarch.AddrRangeSeq{}, syserror.EFAULT
}
if numIovecs == 1 {
// Special case to avoid allocating dst.
- return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil
+ return hostarch.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil
}
dst = append(dst, ar)
@@ -218,7 +219,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange
}
default:
- return usermem.AddrRangeSeq{}, syserror.ENOSYS
+ return hostarch.AddrRangeSeq{}, syserror.ENOSYS
}
// Truncate to MAX_RW_COUNT.
@@ -226,13 +227,13 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange
for i := range dst {
dstlen := uint64(dst[i].Length())
if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen {
- dst[i].End -= usermem.Addr(dstlen - rem)
+ dst[i].End -= hostarch.Addr(dstlen - rem)
dstlen = rem
}
total += dstlen
}
- return usermem.AddrRangeSeqFromSlice(dst), nil
+ return hostarch.AddrRangeSeqFromSlice(dst), nil
}
// SingleIOSequence returns a usermem.IOSequence representing [addr,
@@ -245,7 +246,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange
// write syscalls in Linux do not use import_single_range(). However they check
// access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address
// ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().)
-func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) {
+func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) {
if length > MAX_RW_COUNT {
length = MAX_RW_COUNT
}
@@ -255,7 +256,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
}
return usermem.IOSequence{
IO: t.MemoryManager(),
- Addrs: usermem.AddrRangeSeqOf(ar),
+ Addrs: hostarch.AddrRangeSeqOf(ar),
Opts: opts,
}, nil
}
@@ -267,7 +268,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
//
// Preconditions: Same as Task.CopyInIovecs.
-func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
+func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
return usermem.IOSequence{}, syserror.EINVAL
}
@@ -317,7 +318,7 @@ func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) {
}
// CopyInBytes implements marshal.CopyContext.CopyInBytes.
-func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+func (cc *taskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) {
tmm, err := cc.getMemoryManager()
if err != nil {
return 0, err
@@ -327,7 +328,7 @@ func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, erro
}
// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
-func (cc *taskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+func (cc *taskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) {
tmm, err := cc.getMemoryManager()
if err != nil {
return 0, err
@@ -360,11 +361,11 @@ func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte {
}
// CopyInBytes implements marshal.CopyContext.CopyInBytes.
-func (cc *ownTaskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+func (cc *ownTaskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) {
return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts)
}
// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
-func (cc *ownTaskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+func (cc *ownTaskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) {
return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts)
}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 09d070ec8..77ad62445 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -114,6 +114,15 @@ func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
}
}
+// forEachTaskLocked applies f to each Task in ts.
+//
+// Preconditions: ts.mu must be locked (for reading or writing).
+func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) {
+ for t := range ts.Root.tids {
+ f(t)
+ }
+}
+
// A PIDNamespace represents a PID namespace, a bimap between thread IDs and
// tasks. See the pid_namespaces(7) man page for further details.
//
diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go
index cf2f7ca72..dfc3c0719 100644
--- a/pkg/sentry/kernel/timekeeper_test.go
+++ b/pkg/sentry/kernel/timekeeper_test.go
@@ -17,12 +17,12 @@ package kernel
import (
"testing"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// mockClocks is a sentrytime.Clocks that simply returns the times in the
@@ -54,7 +54,7 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) {
func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper {
ctx := contexttest.Context(tb)
mfp := pgalloc.MemoryFileProviderFromContext(ctx)
- fr, err := mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous)
+ fr, err := mfp.MemoryFile().Allocate(hostarch.PageSize, usage.Anonymous)
if err != nil {
tb.Fatalf("failed to allocate memory: %v", err)
}
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index 9e5c2d26f..cc0917504 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -17,10 +17,10 @@ package kernel
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/usermem"
)
// vdsoParams are the parameters exposed to the VDSO.
@@ -96,7 +96,7 @@ func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSO
// access returns a mapping of the param page.
func (v *VDSOParamPage) access() (safemem.Block, error) {
- bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite)
+ bs, err := v.mfp.MemoryFile().MapInternal(v.fr, hostarch.ReadWrite)
if err != nil {
return safemem.Block{}, err
}