Diffstat (limited to 'pkg/sentry/kernel')
35 files changed, 704 insertions, 192 deletions
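Most of the hunks below are a mechanical migration of address and page helpers from pkg/usermem to pkg/hostarch. As a quick orientation, the following minimal standalone snippet (illustrative values only) uses the hostarch names that replace their usermem counterparts:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/hostarch"
)

func main() {
	// hostarch.Addr, hostarch.PageSize and hostarch.AccessType are the
	// drop-in replacements for the usermem equivalents used before this change.
	addr := hostarch.Addr(0x7f0000000123)
	if end, ok := addr.RoundUp(); ok {
		fmt.Printf("%#x rounds up to %#x (page size %d, perms %v)\n",
			uintptr(addr), uintptr(end), hostarch.PageSize, hostarch.AnyAccess)
	}
}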
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c53e3e720..a1ec6daab 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -141,6 +141,7 @@ go_library( srcs = [ "abstract_socket_namespace.go", "aio.go", + "cgroup.go", "context.go", "fd_table.go", "fd_table_refs.go", @@ -178,6 +179,7 @@ go_library( "task.go", "task_acct.go", "task_block.go", + "task_cgroup.go", "task_clone.go", "task_context.go", "task_exec.go", @@ -226,6 +228,7 @@ go_library( "//pkg/eventchannel", "//pkg/fspath", "//pkg/goid", + "//pkg/hostarch", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", @@ -240,6 +243,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", @@ -294,6 +298,7 @@ go_test( deps = [ "//pkg/abi", "//pkg/context", + "//pkg/hostarch", "//pkg/sentry/arch", "//pkg/sentry/contexttest", "//pkg/sentry/fs", @@ -305,6 +310,5 @@ go_test( "//pkg/sentry/usage", "//pkg/sync", "//pkg/syserror", - "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go new file mode 100644 index 000000000..1f1c63f37 --- /dev/null +++ b/pkg/sentry/kernel/cgroup.go @@ -0,0 +1,281 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. +const InvalidCgroupHierarchyID uint32 = 0 + +// CgroupControllerType is the name of a cgroup controller. +type CgroupControllerType string + +// CgroupController is the common interface to cgroup controllers available to +// the entire sentry. The controllers themselves are defined by cgroupfs. +// +// Callers of this interface are often unable access synchronization needed to +// ensure returned values remain valid. Some of values returned from this +// interface are thus snapshots in time, and may become stale. This is ok for +// many callers like procfs. +type CgroupController interface { + // Returns the type of this cgroup controller (ex "memory", "cpu"). Returned + // value is valid for the lifetime of the controller. + Type() CgroupControllerType + + // Hierarchy returns the ID of the hierarchy this cgroup controller is + // attached to. Returned value is valid for the lifetime of the controller. + HierarchyID() uint32 + + // Filesystem returns the filesystem this controller is attached to. + // Returned value is valid for the lifetime of the controller. + Filesystem() *vfs.Filesystem + + // RootCgroup returns the root cgroup for this controller. Returned value is + // valid for the lifetime of the controller. + RootCgroup() Cgroup + + // NumCgroups returns the number of cgroups managed by this controller. 
+ // Returned value is a snapshot in time. + NumCgroups() uint64 + + // Enabled returns whether this controller is enabled. Returned value is a + // snapshot in time. + Enabled() bool +} + +// Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters +// a cgroup, it holds a reference on the underlying dentry pointing to the +// cgroup. +// +// +stateify savable +type Cgroup struct { + *kernfs.Dentry + CgroupImpl +} + +func (c *Cgroup) decRef() { + c.Dentry.DecRef(context.Background()) +} + +// Path returns the absolute path of c, relative to its hierarchy root. +func (c *Cgroup) Path() string { + return c.FSLocalPath() +} + +// HierarchyID returns the id of the hierarchy that contains this cgroup. +func (c *Cgroup) HierarchyID() uint32 { + // Note: a cgroup is guaranteed to have at least one controller. + return c.Controllers()[0].HierarchyID() +} + +// CgroupImpl is the common interface to cgroups. +type CgroupImpl interface { + Controllers() []CgroupController + Enter(t *Task) + Leave(t *Task) +} + +// hierarchy represents a cgroupfs filesystem instance, with a unique set of +// controllers attached to it. Multiple cgroupfs mounts may reference the same +// hierarchy. +// +// +stateify savable +type hierarchy struct { + id uint32 + // These are a subset of the controllers in CgroupRegistry.controllers, + // grouped here by hierarchy for conveninent lookup. + controllers map[CgroupControllerType]CgroupController + // fs is not owned by hierarchy. The FS is responsible for unregistering the + // hierarchy on destruction, which removes this association. + fs *vfs.Filesystem +} + +func (h *hierarchy) match(ctypes []CgroupControllerType) bool { + if len(ctypes) != len(h.controllers) { + return false + } + for _, ty := range ctypes { + if _, ok := h.controllers[ty]; !ok { + return false + } + } + return true +} + +// CgroupRegistry tracks the active set of cgroup controllers on the system. +// +// +stateify savable +type CgroupRegistry struct { + // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid + // ids are from 1 to math.MaxUint32. Must be accessed through atomic ops. + // + lastHierarchyID uint32 + + mu sync.Mutex `state:"nosave"` + + // controllers is the set of currently known cgroup controllers on the + // system. Protected by mu. + // + // +checklocks:mu + controllers map[CgroupControllerType]CgroupController + + // hierarchies is the active set of cgroup hierarchies. Protected by mu. + // + // +checklocks:mu + hierarchies map[uint32]hierarchy +} + +func newCgroupRegistry() *CgroupRegistry { + return &CgroupRegistry{ + controllers: make(map[CgroupControllerType]CgroupController), + hierarchies: make(map[uint32]hierarchy), + } +} + +// nextHierarchyID returns a newly allocated, unique hierarchy ID. +func (r *CgroupRegistry) nextHierarchyID() (uint32, error) { + if hid := atomic.AddUint32(&r.lastHierarchyID, 1); hid != 0 { + return hid, nil + } + return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow") +} + +// FindHierarchy returns a cgroup filesystem containing exactly the set of +// controllers named in names. If no such FS is found, FindHierarchy return +// nil. FindHierarchy takes a reference on the returned FS, which is transferred +// to the caller. 
+func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Filesystem { + r.mu.Lock() + defer r.mu.Unlock() + + for _, h := range r.hierarchies { + if h.match(ctypes) { + h.fs.IncRef() + return h.fs + } + } + + return nil +} + +// Register registers the provided set of controllers with the registry as a new +// hierarchy. If any controller is already registered, the function returns an +// error without modifying the registry. The hierarchy can be later referenced +// by the returned id. +func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if len(cs) == 0 { + return InvalidCgroupHierarchyID, fmt.Errorf("can't register hierarchy with no controllers") + } + + for _, c := range cs { + if _, ok := r.controllers[c.Type()]; ok { + return InvalidCgroupHierarchyID, fmt.Errorf("controllers may only be mounted on a single hierarchy") + } + } + + hid, err := r.nextHierarchyID() + if err != nil { + return hid, err + } + + h := hierarchy{ + id: hid, + controllers: make(map[CgroupControllerType]CgroupController), + fs: cs[0].Filesystem(), + } + for _, c := range cs { + n := c.Type() + r.controllers[n] = c + h.controllers[n] = c + } + r.hierarchies[hid] = h + return hid, nil +} + +// Unregister removes a previously registered hierarchy from the registry. If +// the controller was not previously registered, Unregister is a no-op. +func (r *CgroupRegistry) Unregister(hid uint32) { + r.mu.Lock() + defer r.mu.Unlock() + + if h, ok := r.hierarchies[hid]; ok { + for name, _ := range h.controllers { + delete(r.controllers, name) + } + delete(r.hierarchies, hid) + } +} + +// computeInitialGroups takes a reference on each of the returned cgroups. The +// caller takes ownership of this returned reference. +func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} { + r.mu.Lock() + defer r.mu.Unlock() + + ctlSet := make(map[CgroupControllerType]CgroupController) + cgset := make(map[Cgroup]struct{}) + + // Remember controllers from the inherited cgroups set... + for cg, _ := range inherit { + cg.IncRef() // Ref transferred to caller. + for _, ctl := range cg.Controllers() { + ctlSet[ctl.Type()] = ctl + cgset[cg] = struct{}{} + } + } + + // ... and add the root cgroups of all the missing controllers. + for name, ctl := range r.controllers { + if _, ok := ctlSet[name]; !ok { + cg := ctl.RootCgroup() + cg.IncRef() // Ref transferred to caller. + cgset[cg] = struct{}{} + } + } + return cgset +} + +// GenerateProcCgroups writes the contents of /proc/cgroups to buf. 
+func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) { + r.mu.Lock() + entries := make([]string, 0, len(r.controllers)) + for _, c := range r.controllers { + en := 0 + if c.Enabled() { + en = 1 + } + entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en)) + } + r.mu.Unlock() + + sort.Strings(entries) + fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n") + for _, e := range entries { + fmt.Fprint(buf, e) + } +} diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 7ecbd29ab..564c3d42e 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -10,6 +10,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fdnotifier", + "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 2aca02fd5..4466fbc9d 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -186,7 +187,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro e.wq.Notify(waiter.WritableEvents) var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) + hostarch.ByteOrder.PutUint64(buf[:], val) _, err := dst.CopyOut(ctx, buf[:]) return err } @@ -194,7 +195,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro // Must be called with e.mu locked. func (e *EventOperations) hostWrite(val uint64) error { var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) + hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(e.hostfd, buf[:]) if err == unix.EWOULDBLOCK { return syserror.ErrWouldBlock @@ -207,7 +208,7 @@ func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) err if _, err := src.CopyIn(ctx, buf[:]); err != nil { return err } - val := usermem.ByteOrder.Uint64(buf[:]) + val := hostarch.ByteOrder.Uint64(buf[:]) return e.Signal(val) } diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 041e3d4ca..a75686cf3 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -37,6 +37,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/hostarch", "//pkg/log", "//pkg/sentry/memmap", "//pkg/sync", @@ -52,8 +53,8 @@ go_test( library = ":futex", deps = [ "//pkg/context", + "//pkg/hostarch", "//pkg/sync", - "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index e4dcc4d40..0427cf3f4 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -20,10 +20,10 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // KeyKind indicates the type of a Key. @@ -83,8 +83,8 @@ func (k *Key) clone() Key { } // Preconditions: k.Kind == KindPrivate or KindSharedPrivate. 
-func (k *Key) addr() usermem.Addr { - return usermem.Addr(k.Offset) +func (k *Key) addr() hostarch.Addr { + return hostarch.Addr(k.Offset) } // matches returns true if a wakeup on k2 should wake a waiter waiting on k. @@ -97,14 +97,14 @@ func (k *Key) matches(k2 *Key) bool { type Target interface { context.Context - // SwapUint32 gives access to usermem.IO.SwapUint32. - SwapUint32(addr usermem.Addr, new uint32) (uint32, error) + // SwapUint32 gives access to hostarch.IO.SwapUint32. + SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) - // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32. - CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + // CompareAndSwap gives access to hostarch.IO.CompareAndSwapUint32. + CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) - // LoadUint32 gives access to usermem.IO.LoadUint32. - LoadUint32(addr usermem.Addr) (uint32, error) + // LoadUint32 gives access to hostarch.IO.LoadUint32. + LoadUint32(addr hostarch.Addr) (uint32, error) // GetSharedKey returns a Key with kind KindSharedPrivate or // KindSharedMappable corresponding to the memory mapped at address addr. @@ -112,11 +112,11 @@ type Target interface { // If GetSharedKey returns a Key with a non-nil MappingIdentity, a // reference is held on the MappingIdentity, which must be dropped by the // caller when the Key is no longer in use. - GetSharedKey(addr usermem.Addr) (Key, error) + GetSharedKey(addr hostarch.Addr) (Key, error) } // check performs a basic equality check on the given address. -func check(t Target, addr usermem.Addr, val uint32) error { +func check(t Target, addr hostarch.Addr, val uint32) error { cur, err := t.LoadUint32(addr) if err != nil { return err @@ -128,7 +128,7 @@ func check(t Target, addr usermem.Addr, val uint32) error { } // atomicOp performs a complex operation on the given address. -func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { +func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) { opType := (opIn >> 28) & 0xf cmp := (opIn >> 24) & 0xf opArg := (opIn >> 12) & 0xfff @@ -328,7 +328,7 @@ const ( ) // getKey returns a Key representing address addr in c. -func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { +func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. if addr&0x3 != 0 { @@ -341,7 +341,7 @@ func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { } // bucketIndexForAddr returns the index into Manager.buckets for addr. -func bucketIndexForAddr(addr usermem.Addr) uintptr { +func bucketIndexForAddr(addr hostarch.Addr) uintptr { // - The bottom 2 bits of addr must be 0, per getKey. // // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 @@ -448,7 +448,7 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { // Wake wakes up to n waiters matching the bitmask on the given addr. // The number of waiters woken is returned. -func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) { +func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) { // This function is very hot; avoid defer. 
k, err := getKey(t, addr, private) if err != nil { @@ -463,7 +463,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32 return r, nil } -func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { +func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { k1, err := getKey(t, addr, private) if err != nil { return 0, err @@ -498,14 +498,14 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch // Requeue wakes up to nwake waiters on the given addr, and unconditionally // requeues up to nreq waiters on naddr. -func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) { +func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) { return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq) } // RequeueCmp atomically checks that the addr contains val (via the Target), // wakes up to nwake waiters on addr and then unconditionally requeues nreq // waiters on naddr. -func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { +func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq) } @@ -513,7 +513,7 @@ func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, v // waiters unconditionally from addr1, and, based on the original value at addr2 // and a comparison encoded in op, wakes up to nwake2 waiters from addr2. // It returns the total number of waiters woken. -func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { +func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { k1, err := getKey(t, addr1, private) if err != nil { return 0, err @@ -553,7 +553,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the // Waiter must be subsequently removed by calling WaitComplete, whether or not // a wakeup is received on w.C. -func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error { +func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error { k, err := getKey(t, addr, private) if err != nil { return err @@ -631,7 +631,7 @@ func (m *Manager) WaitComplete(w *Waiter, t Target) { // FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see // exit_robust_list()). Given we don't support robust lists, although handled // below, it's never set. 
-func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) { +func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) { k, err := getKey(t, addr, private) if err != nil { return false, err @@ -663,7 +663,7 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri return success, nil } -func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) { +func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) { for { cur, err := t.LoadUint32(addr) if err != nil { @@ -724,7 +724,7 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3 // The address provided must contain the caller's TID. If there are waiters, // TID of the next waiter (FIFO) is set to the given address, and the waiter // woken up. If there are no waiters, 0 is set to the address. -func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { +func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { return err @@ -738,7 +738,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool return err } -func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error { +func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error { cur, err := t.LoadUint32(addr) if err != nil { return err diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index ba7f95d8a..deba44e5c 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -23,8 +23,8 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/usermem" ) // testData implements the Target interface, and allows us to @@ -43,23 +43,23 @@ func newTestData(size uint) testData { } } -func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { +func (t testData) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) { val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new) return val, nil } -func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { +func (t testData) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) { if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) { return old, nil } return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } -func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { +func (t testData) LoadUint32(addr hostarch.Addr) (uint32, error) { return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } -func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { +func (t testData) GetSharedKey(addr hostarch.Addr) (Key, error) { return Key{ Kind: KindSharedMappable, Offset: uint64(addr), @@ -73,7 +73,7 @@ func futexKind(private bool) string { return "shared" } -func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) *Waiter { +func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) *Waiter { w := 
NewWaiter() if err := m.WaitPrepare(w, ta, addr, private, val, bitmask); err != nil { t.Fatalf("WaitPrepare failed: %v", err) @@ -463,12 +463,12 @@ const ( // Beyond being used as a Locker, this is a simple mechanism for // changing the underlying values for simpler tests. type testMutex struct { - a usermem.Addr + a hostarch.Addr d testData m *Manager } -func newTestMutex(addr usermem.Addr, d testData, m *Manager) *testMutex { +func newTestMutex(addr hostarch.Addr, d testData, m *Manager) *testMutex { return &testMutex{a: addr, d: d, m: m} } diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go index 4fcdfc541..4b943106b 100644 --- a/pkg/sentry/kernel/kcov.go +++ b/pkg/sentry/kernel/kcov.go @@ -22,13 +22,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov @@ -130,7 +130,7 @@ func (kcov *Kcov) InitTrace(size uint64) error { // To simplify all the logic around mapping, we require that the length of the // shared region is a multiple of the system page size. - if (8*size)&(usermem.PageSize-1) != 0 { + if (8*size)&(hostarch.PageSize-1) != 0 { return syserror.EINVAL } @@ -286,7 +286,7 @@ func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { } // Get internal mappings. - bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read) + bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Read) if err != nil { return 0, err } @@ -314,7 +314,7 @@ func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } // Get internal mapping. - bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write) + bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Write) if err != nil { return 0, err } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 43065b45a..e6e9da898 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -294,6 +294,11 @@ type Kernel struct { // YAMAPtraceScope is the current level of YAMA ptrace restrictions. YAMAPtraceScope int32 + + // cgroupRegistry contains the set of active cgroup controllers on the + // system. It is controller by cgroupfs. Nil if cgroupfs is unavailable on + // the system. + cgroupRegistry *CgroupRegistry } // InitKernelArgs holds arguments to Init. @@ -438,6 +443,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.socketMount = socketMount k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord) + + k.cgroupRegistry = newCgroupRegistry() } return nil } @@ -1815,6 +1822,11 @@ func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount } +// CgroupRegistry returns the cgroup registry. +func (k *Kernel) CgroupRegistry() *CgroupRegistry { + return k.cgroupRegistry +} + // Release releases resources owned by k. // // Precondition: This should only be called after the kernel is fully @@ -1831,3 +1843,43 @@ func (k *Kernel) Release() { k.timekeeper.Destroy() k.vdso.Release(ctx) } + +// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup +// hierarchy. +// +// Precondition: root must be a new cgroup with no tasks. 
This implies the +// controllers for root are also new and currently manage no task, which in turn +// implies the new cgroup can be populated without migrating tasks between +// cgroups. +func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) { + k.tasks.mu.RLock() + k.tasks.forEachTaskLocked(func(t *Task) { + if t.exitState != TaskExitNone { + return + } + t.mu.Lock() + t.enterCgroupLocked(root) + t.mu.Unlock() + }) + k.tasks.mu.RUnlock() +} + +// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the +// hierarchy with the provided id. This is intended for use during hierarchy +// teardown, as otherwise the tasks would be orphaned w.r.t to some controllers. +func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) { + k.tasks.mu.RLock() + k.tasks.forEachTaskLocked(func(t *Task) { + if t.exitState != TaskExitNone { + return + } + t.mu.Lock() + for cg, _ := range t.cgroups { + if cg.HierarchyID() == hid { + t.leaveCgroupLocked(cg) + } + } + t.mu.Unlock() + }) + k.tasks.mu.RUnlock() +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index beba6d97d..34c617b08 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/context", + "//pkg/hostarch", "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index d004f2357..06769931a 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -22,18 +22,18 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( // MinimumPipeSize is a hard limit of the minimum size of a pipe. // It corresponds to fs/pipe.c:pipe_min_size. - MinimumPipeSize = usermem.PageSize + MinimumPipeSize = hostarch.PageSize // MaximumPipeSize is a hard limit on the maximum size of a pipe. // It corresponds to fs/pipe.c:pipe_max_size. @@ -41,7 +41,7 @@ const ( // DefaultPipeSize is the system-wide default size of a pipe in bytes. // It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS. - DefaultPipeSize = 16 * usermem.PageSize + DefaultPipeSize = 16 * hostarch.PageSize // atomicIOBytes is the maximum number of bytes that the pipe will // guarantee atomic reads or writes atomically. 
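Taken together, the CgroupRegistry and Kernel changes above suggest the mount-time flow cgroupfs is expected to follow: reuse an existing hierarchy whose controller set matches exactly, otherwise register a new hierarchy and move every task into its root cgroup. A hypothetical sketch, not part of this change (mountHierarchy, ctls and root are illustrative names):

package sketch

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

func mountHierarchy(k *kernel.Kernel, ctls []kernel.CgroupController, root kernel.Cgroup) (*vfs.Filesystem, error) {
	r := k.CgroupRegistry()

	ctypes := make([]kernel.CgroupControllerType, 0, len(ctls))
	for _, c := range ctls {
		ctypes = append(ctypes, c.Type())
	}

	// FindHierarchy transfers a filesystem reference to the caller on success.
	if fs := r.FindHierarchy(ctypes); fs != nil {
		return fs, nil
	}

	// No hierarchy with exactly this controller set exists yet: register one
	// and place all existing tasks in its root cgroup.
	if _, err := r.Register(ctls); err != nil {
		return nil, err
	}
	k.PopulateNewCgroupHierarchy(root)
	return ctls[0].Filesystem(), nil
}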
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index e524afad5..95b948edb 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -17,6 +17,7 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -274,7 +275,7 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti } src := usermem.IOSequence{ IO: fd, - Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}), } var ( @@ -302,7 +303,7 @@ func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescripti func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) { dst := usermem.IOSequence{ IO: fd, - Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}), } var ( @@ -328,7 +329,7 @@ func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescript // fd.pipe.Notify(waiter.WritableEvents) after the read is completed. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { +func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) { n, err := fd.pipe.peekLocked(int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs) }) @@ -340,7 +341,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, // is completed. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { +func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) { n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) }) @@ -350,7 +351,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, // ZeroOut implements usermem.IO.ZeroOut. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { +func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) { return safemem.ZeroSeq(dsts) }) @@ -362,7 +363,7 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6 // fd.pipe.Notify(waiter.WritableEvents) after the read is completed. // // Preconditions: fd.pipe.mu must be locked. 
-func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { +func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { return fd.pipe.peekLocked(ars.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) { return dst.WriteFromBlocks(srcs) }) @@ -373,25 +374,25 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst // is completed. // // Preconditions: fd.pipe.mu must be locked. -func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { +func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { return fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) { return src.ReadToBlocks(dsts) }) } // SwapUint32 implements usermem.IO.SwapUint32. -func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { +func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { // How did a pipe get passed as the virtual address space to futex(2)? panic("VFSPipeFD.SwapUint32 called unexpectedly") } // CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. -func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { +func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") } // LoadUint32 implements usermem.IO.LoadUint32. -func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { +func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) { panic("VFSPipeFD.LoadUint32 called unexpectedly") } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index f5a60e749..57c7659e7 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -1011,7 +1012,7 @@ func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { } // Ptrace implements the ptrace system call. -func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // PTRACE_TRACEME ignores all other arguments. if req == linux.PTRACE_TRACEME { return t.ptraceTraceme() @@ -1190,7 +1191,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { panic(fmt.Sprintf("%#x + %#x overflows. 
Invalid reg size > %#x", ar.Start, n, ar.Length())) } ar.End = end - return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_SETREGSET: ars, err := t.CopyInIovecs(data, 1) @@ -1214,8 +1215,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return err } t.p.FullStateChanged() - ar.End -= usermem.Addr(n) - return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + ar.End -= hostarch.Addr(n) + return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_GETSIGINFO: t.tg.pidns.owner.mu.RLock() @@ -1267,7 +1268,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() - _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg) + _, err := primitive.CopyUint64Out(t, hostarch.Addr(data), target.ptraceEventMsg) return err // PEEKSIGINFO is unimplemented but seems to have no users anywhere. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 7aea3dcd8..5ae05b5c3 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -18,12 +18,13 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. -func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { +func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { switch req { case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER n, err := target.Arch().PtracePeekUser(uintptr(addr)) diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index d971b96b3..46dd84cbc 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -17,11 +17,11 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. -func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { +func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { return syserror.EIO } diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 2a9023fdf..4bc5bca44 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -43,8 +44,8 @@ type OldRSeqCriticalRegion struct { // application handler while its instruction pointer is in CriticalSection, // set the instruction pointer to Restart and application register r10 (on // amd64) to the former instruction pointer. - CriticalSection usermem.AddrRange - Restart usermem.Addr + CriticalSection hostarch.AddrRange + Restart hostarch.Addr } // RSeqAvailable returns true if t supports (old and new) restartable sequences. @@ -55,7 +56,7 @@ func (t *Task) RSeqAvailable() bool { // SetRSeq registers addr as this thread's rseq structure. // // Preconditions: The caller must be running on the task goroutine. 
-func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { +func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr != 0 { if t.rseqAddr != addr { return syserror.EINVAL @@ -100,7 +101,7 @@ func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { // ClearRSeq unregisters addr as this thread's rseq structure. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error { +func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr == 0 { return syserror.EINVAL } @@ -166,7 +167,7 @@ func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { // CPU number. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) OldRSeqCPUAddr() usermem.Addr { +func (t *Task) OldRSeqCPUAddr() hostarch.Addr { return t.oldRSeqCPUAddr } @@ -177,7 +178,7 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr { // * t.RSeqAvailable() == true. // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. -func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { +func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error { t.oldRSeqCPUAddr = addr // Check that addr is writable. @@ -221,7 +222,7 @@ func (t *Task) oldRSeqCopyOutCPU() error { } buf := t.CopyScratchBuffer(4) - usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) + hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) return err } @@ -236,8 +237,8 @@ func (t *Task) rseqCopyOutCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. - usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart - usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID + hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart + hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. @@ -251,8 +252,8 @@ func (t *Task) rseqCopyOutCPU() error { func (t *Task) rseqClearCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. - usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart - usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID + hostarch.ByteOrder.PutUint32(buf, 0) // CPUIDStart + hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. 
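The rseq hunks above pack CPUIDStart and CPUID, the first two uint32 fields of linux.RSeq, into an 8-byte scratch buffer in native byte order before copying it out to userspace. A minimal standalone sketch of that layout (packRSeqCPU is an illustrative helper, not part of the change):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/hostarch"
)

// packRSeqCPU mirrors the layout written by rseqCopyOutCPU: CPUIDStart in the
// first four bytes, CPUID in the next four, both in native byte order.
func packRSeqCPU(cpu int32) []byte {
	buf := make([]byte, 8)
	hostarch.ByteOrder.PutUint32(buf[0:4], uint32(cpu)) // CPUIDStart
	hostarch.ByteOrder.PutUint32(buf[4:8], uint32(cpu)) // CPUID
	return buf
}

func main() {
	fmt.Printf("% x\n", packRSeqCPU(3))
}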
@@ -305,7 +306,7 @@ func (t *Task) rseqAddrInterrupt() { return } - critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf)) + critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf)) if critAddr == 0 { return } @@ -325,7 +326,7 @@ func (t *Task) rseqAddrInterrupt() { return } - start := usermem.Addr(cs.Start) + start := hostarch.Addr(cs.Start) critRange, ok := start.ToRange(cs.PostCommitOffset) if !ok { t.Debugf("Invalid start and offset in %+v", cs) @@ -334,7 +335,7 @@ func (t *Task) rseqAddrInterrupt() { return } - abort := usermem.Addr(cs.Abort) + abort := hostarch.Addr(cs.Abort) if critRange.Contains(abort) { t.Debugf("Abort in critical section in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) @@ -353,7 +354,7 @@ func (t *Task) rseqAddrInterrupt() { return } - sig := usermem.ByteOrder.Uint32(buf) + sig := hostarch.ByteOrder.Uint32(buf) if sig != t.rseqSignature { t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) t.forceSignal(linux.SIGSEGV, false /* unconditional */) @@ -376,7 +377,7 @@ func (t *Task) rseqAddrInterrupt() { } // Finally we can actually decide whether or not to restart. - if !critRange.Contains(usermem.Addr(t.Arch().IP())) { + if !critRange.Contains(hostarch.Addr(t.Arch().IP())) { return } @@ -386,7 +387,7 @@ func (t *Task) rseqAddrInterrupt() { // Preconditions: The caller must be running on the task goroutine. func (t *Task) oldRSeqInterrupt() { r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) - if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) { + if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) { t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) t.Arch().SetIP(uintptr(r.Restart)) t.Arch().SetOldRSeqInterruptedIP(ip) diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 8163a6132..a95e174a2 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -18,9 +18,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) const maxSyscallFilterInstructions = 1 << 15 @@ -35,11 +35,11 @@ func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { return bpf.InputBytes{ Data: buf, // Go-marshal always uses the native byte order. - Order: usermem.ByteOrder, + Order: hostarch.ByteOrder, } } -func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { +func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *arch.SignalInfo { si := &arch.SignalInfo{ Signo: int32(linux.SIGSYS), Errno: errno, @@ -56,7 +56,7 @@ func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalIn // in because vsyscalls do not use the values in t.Arch().) // // Preconditions: The caller must be running on the task goroutine. 
-func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction { +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction { result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) action := result & linux.SECCOMP_RET_ACTION switch action { @@ -102,7 +102,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u return action } -func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 { data := linux.SeccompData{ Nr: sysno, Arch: t.image.st.AuditNumber, diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 073e14507..1c3c0794f 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -28,6 +28,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/hostarch", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 92d60ba78..a73f1bdca 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -38,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -47,7 +48,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // Key represents a shm segment key. Analogous to a file name. @@ -197,13 +197,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui } var sizeAligned uint64 - if val, ok := usermem.Addr(size).RoundUp(); ok { + if val, ok := hostarch.Addr(size).RoundUp(); ok { sizeAligned = uint64(val) } else { return nil, syserror.EINVAL } - if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL { + if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL { // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) @@ -232,7 +232,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) } - effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) + effectiveSize := uint64(hostarch.Addr(size).MustRoundUp()) fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) if err != nil { return nil, err @@ -267,7 +267,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi r.shms[id] = shm r.keysToShms[key] = shm - r.totalPages += effectiveSize / usermem.PageSize + r.totalPages += effectiveSize / hostarch.PageSize return shm, nil } @@ -318,7 +318,7 @@ func (r *Registry) remove(s *Shm) { } delete(r.shms, s.ID) - r.totalPages -= s.effectiveSize / usermem.PageSize + r.totalPages -= s.effectiveSize / hostarch.PageSize } // Release drops the self-reference of each active shm segment in the registry. @@ -386,7 +386,7 @@ type Shm struct { // effectiveSize of the segment, rounding up to the next page // boundary. Immutable. // - // Invariant: effectiveSize must be a multiple of usermem.PageSize. 
+ // Invariant: effectiveSize must be a multiple of hostarch.PageSize. effectiveSize uint64 // fr is the offset into mfp.MemoryFile() that backs this contents of this @@ -467,7 +467,7 @@ func (s *Shm) Msync(context.Context, memmap.MappableRange) error { } // AddMapping implements memmap.Mappable.AddMapping. -func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error { +func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error { s.mu.Lock() defer s.mu.Unlock() s.attachTime = ktime.NowFromContext(ctx) @@ -482,7 +482,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A } // RemoveMapping implements memmap.Mappable.RemoveMapping. -func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { +func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() // RemoveMapping may be called during task exit, when ctx @@ -503,12 +503,12 @@ func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ userme } // CopyMapping implements memmap.Mappable.CopyMapping. -func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { +func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error { return nil } // Translate implements memmap.Mappable.Translate. -func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { +func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > s.fr.Length() { err = &memmap.BusError{syserror.EFAULT} @@ -519,7 +519,7 @@ func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableR Source: source, File: s.mfp.MemoryFile(), Offset: s.fr.Start + source.Start, - Perms: usermem.AnyAccess, + Perms: hostarch.AnyAccess, }, }, err } @@ -543,7 +543,7 @@ type AttachOpts struct { // // Postconditions: The returned MMapOpts are valid only as long as a reference // continues to be held on s. -func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { +func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) { s.mu.Lock() defer s.mu.Unlock() if s.pendingDestruction && s.ReadRefs() == 0 { @@ -565,12 +565,12 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac Offset: 0, Addr: addr, Fixed: opts.Remap, - Perms: usermem.AccessType{ + Perms: hostarch.AccessType{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, }, - MaxPerms: usermem.AnyAccess, + MaxPerms: hostarch.AnyAccess, Mappable: s, MappingIdentity: s, }, nil diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 332bdb8e8..953d4310e 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -20,9 +20,9 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/usermem" ) // maxSyscallNum is the highest supported syscall number. 
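The shm hunks above round the requested segment size up to a page boundary with hostarch.Addr and charge the result, in pages, against the system-wide SHMALL limit. A minimal sketch of that accounting (segmentPages is an illustrative helper, not part of the change):

package sketch

import (
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/syserror"
)

// segmentPages returns the number of pages a segment of the given size
// occupies once rounded up to a page boundary, as Registry.FindOrCreate does
// before checking the SHMALL limit.
func segmentPages(size uint64) (uint64, error) {
	val, ok := hostarch.Addr(size).RoundUp()
	if !ok {
		// Rounding up overflowed; reject the request, mirroring FindOrCreate.
		return 0, syserror.EINVAL
	}
	return uint64(val) / hostarch.PageSize, nil
}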
@@ -243,7 +243,7 @@ type SyscallTable struct { // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. - Emulate map[usermem.Addr]uintptr + Emulate map[hostarch.Addr]uintptr // The function to call in case of a missing system call. Missing MissingFn @@ -316,7 +316,7 @@ func (s *SyscallTable) Init() { } if s.Emulate == nil { // Ensure non-nil emulate table. - s.Emulate = make(map[usermem.Addr]uintptr) + s.Emulate = make(map[hostarch.Addr]uintptr) } max := s.MaxSysno() // Checked during RegisterSyscallTable. @@ -359,7 +359,7 @@ func (s *SyscallTable) LookupNo(name string) (uintptr, error) { } // LookupEmulate looks up an emulation syscall number. -func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { +func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] return sysno, ok } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 36141dd09..be1371855 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -33,7 +34,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -470,7 +470,7 @@ type Task struct { // ThreadID to 0, and wake any futex waiters. // // cleartid is exclusive to the task goroutine. - cleartid usermem.Addr + cleartid hostarch.Addr // This is mostly a fake cpumask just for sched_set/getaffinity as we // don't really control the affinity. @@ -540,12 +540,12 @@ type Task struct { // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. // // oldRSeqCPUAddr is exclusive to the task goroutine. - oldRSeqCPUAddr usermem.Addr + oldRSeqCPUAddr hostarch.Addr // rseqAddr is a pointer to the userspace linux.RSeq structure. // // rseqAddr is exclusive to the task goroutine. - rseqAddr usermem.Addr + rseqAddr hostarch.Addr // rseqSignature is the signature that the rseq abort IP must be signed // with. @@ -575,7 +575,7 @@ type Task struct { // robustList is a pointer to the head of the tasks's robust futex // list. - robustList usermem.Addr + robustList hostarch.Addr // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). @@ -587,6 +587,12 @@ type Task struct { // // kcov is exclusive to the task goroutine. kcov *Kcov + + // cgroups is the set of cgroups this task belongs to. This may be empty if + // no cgroup controllers are enabled. Protected by mu. + // + // +checklocks:mu + cgroups map[Cgroup]struct{} } func (t *Task) savePtraceTracer() *Task { @@ -652,7 +658,7 @@ func (t *Task) Kernel() *Kernel { // SetClearTID sets t's cleartid. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) SetClearTID(addr usermem.Addr) { +func (t *Task) SetClearTID(addr hostarch.Addr) { t.cleartid = addr } diff --git a/pkg/sentry/kernel/task_cgroup.go b/pkg/sentry/kernel/task_cgroup.go new file mode 100644 index 000000000..25d2504fa --- /dev/null +++ b/pkg/sentry/kernel/task_cgroup.go @@ -0,0 +1,138 @@ +// Copyright 2021 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "strings" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/syserror" +) + +// EnterInitialCgroups moves t into an initial set of cgroups. +// +// Precondition: t isn't in any cgroups yet, t.cgs is empty. +// +// +checklocksignore parent.mu is conditionally acquired. +func (t *Task) EnterInitialCgroups(parent *Task) { + var inherit map[Cgroup]struct{} + if parent != nil { + parent.mu.Lock() + defer parent.mu.Unlock() + inherit = parent.cgroups + } + joinSet := t.k.cgroupRegistry.computeInitialGroups(inherit) + + t.mu.Lock() + defer t.mu.Unlock() + // Transfer ownership of joinSet refs to the task's cgset. + t.cgroups = joinSet + for c, _ := range t.cgroups { + // Since t isn't in any cgroup yet, we can skip the check against + // existing cgroups. + c.Enter(t) + } +} + +// EnterCgroup moves t into c. +func (t *Task) EnterCgroup(c Cgroup) error { + newControllers := make(map[CgroupControllerType]struct{}) + for _, ctl := range c.Controllers() { + newControllers[ctl.Type()] = struct{}{} + } + + t.mu.Lock() + defer t.mu.Unlock() + + for oldCG, _ := range t.cgroups { + for _, oldCtl := range oldCG.Controllers() { + if _, ok := newControllers[oldCtl.Type()]; ok { + // Already in a cgroup with the same controller as one of the + // new ones. Requires migration between cgroups. + // + // TODO(b/183137098): Implement cgroup migration. + log.Warningf("Cgroup migration is not implemented") + return syserror.EBUSY + } + } + } + + // No migration required. + t.enterCgroupLocked(c) + + return nil +} + +// +checklocks:t.mu +func (t *Task) enterCgroupLocked(c Cgroup) { + c.IncRef() + t.cgroups[c] = struct{}{} + c.Enter(t) +} + +// LeaveCgroups removes t out from all its cgroups. +func (t *Task) LeaveCgroups() { + t.mu.Lock() + defer t.mu.Unlock() + for c, _ := range t.cgroups { + t.leaveCgroupLocked(c) + } +} + +// +checklocks:t.mu +func (t *Task) leaveCgroupLocked(c Cgroup) { + c.Leave(t) + delete(t.cgroups, c) + c.decRef() +} + +// taskCgroupEntry represents a line in /proc/<pid>/cgroup, and is used to +// format a cgroup for display. +type taskCgroupEntry struct { + hierarchyID uint32 + controllers string + path string +} + +// GenerateProcTaskCgroup writes the contents of /proc/<pid>/cgroup for t to buf. +func (t *Task) GenerateProcTaskCgroup(buf *bytes.Buffer) { + t.mu.Lock() + defer t.mu.Unlock() + + cgEntries := make([]taskCgroupEntry, 0, len(t.cgroups)) + for c, _ := range t.cgroups { + ctls := c.Controllers() + ctlNames := make([]string, 0, len(ctls)) + for _, ctl := range ctls { + ctlNames = append(ctlNames, string(ctl.Type())) + } + + cgEntries = append(cgEntries, taskCgroupEntry{ + // Note: We're guaranteed to have at least one controller, and all + // controllers are guaranteed to be on the same hierarchy. 
+ hierarchyID: ctls[0].HierarchyID(), + controllers: strings.Join(ctlNames, ","), + path: c.Path(), + }) + } + + sort.Slice(cgEntries, func(i, j int) bool { return cgEntries[i].hierarchyID > cgEntries[j].hierarchyID }) + for _, cgE := range cgEntries { + fmt.Fprintf(buf, "%d:%s:%s\n", cgE.hierarchyID, cgE.controllers, cgE.path) + } +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index f305e69c0..405771f3f 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -85,12 +86,12 @@ type CloneOptions struct { // Stack is the initial stack pointer of the new task. If Stack is 0, the // new task will start with the same stack pointer as its parent. - Stack usermem.Addr + Stack hostarch.Addr // If SetTLS is true, set the new task's TLS (thread-local storage) // descriptor to TLS. If SetTLS is false, TLS is ignored. SetTLS bool - TLS usermem.Addr + TLS hostarch.Addr // If ChildClearTID is true, when the child exits, 0 is written to the // address ChildTID in the child's memory, and if the write is successful a @@ -101,7 +102,7 @@ type CloneOptions struct { // Linux, failed writes are silently ignored.) ChildClearTID bool ChildSetTID bool - ChildTID usermem.Addr + ChildTID hostarch.Addr // If ParentSetTID is true, the child's thread ID (in the parent's PID // namespace) is written to address ParentTID in the parent's memory. (As @@ -112,7 +113,7 @@ type CloneOptions struct { // and child's memory, but this is a documentation error fixed by // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). ParentSetTID bool - ParentTID usermem.Addr + ParentTID hostarch.Addr // If Vfork is true, place the parent in vforkStop until the cloned task // releases its TaskImage. @@ -268,7 +269,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } tg := t.tg - rseqAddr := usermem.Addr(0) + rseqAddr := hostarch.Addr(0) rseqSignature := uint32(0) if opts.NewThreadGroup { if tg.mounts != nil { diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index ad59e4f60..b1af1a7ef 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -275,6 +275,10 @@ func (*runExitMain) execute(t *Task) taskRunState { t.fsContext.DecRef(t) t.fdTable.DecRef(t) + // Detach task from all cgroups. This must happen before potentially the + // last ref to the cgroupfs mount is dropped below. + t.LeaveCgroups() + t.mu.Lock() if t.mountNamespaceVFS2 != nil { t.mountNamespaceVFS2.DecRef(t) diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 195c7da9b..4dc41b82b 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" @@ -30,33 +31,33 @@ func (t *Task) Futex() *futex.Manager { } // SwapUint32 implements futex.Target.SwapUint32. 
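// GenerateProcTaskCgroup, completed just above, renders /proc/<pid>/cgroup
// as "<hierarchy-id>:<controller-list>:<path>" lines sorted by descending
// hierarchy ID. This stand-alone sketch shows only that formatting step;
// taskCgroupEntry is redeclared locally and the entry values are invented.
package main

import (
	"bytes"
	"fmt"
	"sort"
)

type taskCgroupEntry struct {
	hierarchyID uint32
	controllers string
	path        string
}

func main() {
	// Hypothetical task in two single-controller hierarchies.
	entries := []taskCgroupEntry{
		{hierarchyID: 1, controllers: "memory", path: "/app"},
		{hierarchyID: 2, controllers: "cpu", path: "/app/worker"},
	}

	// Highest hierarchy ID first, as in the kernel-side code.
	sort.Slice(entries, func(i, j int) bool { return entries[i].hierarchyID > entries[j].hierarchyID })

	var buf bytes.Buffer
	for _, e := range entries {
		fmt.Fprintf(&buf, "%d:%s:%s\n", e.hierarchyID, e.controllers, e.path)
	}
	fmt.Print(buf.String())
	// Output:
	// 2:cpu:/app/worker
	// 1:memory:/app
}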
-func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { +func (t *Task) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) { return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32. -func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { +func (t *Task) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) { return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // LoadUint32 implements futex.Target.LoadUint32. -func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { +func (t *Task) LoadUint32(addr hostarch.Addr) (uint32, error) { return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ AddressSpaceActive: true, }) } // GetSharedKey implements futex.Target.GetSharedKey. -func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { +func (t *Task) GetSharedKey(addr hostarch.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } // GetRobustList sets the robust futex list for the task. -func (t *Task) GetRobustList() usermem.Addr { +func (t *Task) GetRobustList() hostarch.Addr { t.mu.Lock() addr := t.robustList t.mu.Unlock() @@ -64,7 +65,7 @@ func (t *Task) GetRobustList() usermem.Addr { } // SetRobustList sets the robust futex list for the task. -func (t *Task) SetRobustList(addr usermem.Addr) { +func (t *Task) SetRobustList(addr hostarch.Addr) { t.mu.Lock() t.robustList = addr t.mu.Unlock() @@ -84,28 +85,28 @@ func (t *Task) exitRobustList() { } var rl linux.RobustListHead - if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil { + if _, err := rl.CopyIn(t, hostarch.Addr(addr)); err != nil { return } next := primitive.Uint64(rl.List) done := 0 - var pendingLockAddr usermem.Addr + var pendingLockAddr hostarch.Addr if rl.ListOpPending != 0 { - pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset) + pendingLockAddr = hostarch.Addr(rl.ListOpPending + rl.FutexOffset) } // Wake up normal elements. - for usermem.Addr(next) != addr { + for hostarch.Addr(next) != addr { // We traverse to the next element of the list before we // actually wake anything. This prevents the race where waking // this futex causes a modification of the list. - thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset) + thisLockAddr := hostarch.Addr(uint64(next) + rl.FutexOffset) // Try to decode the next element in the list before waking the // current futex. But don't check the error until after we've // woken the current futex. Linux does it in this order too - _, nextErr := next.CopyIn(t, usermem.Addr(next)) + _, nextErr := next.CopyIn(t, hostarch.Addr(next)) // Wakeup the current futex if it's not pending. if thisLockAddr != pendingLockAddr { @@ -133,7 +134,7 @@ func (t *Task) exitRobustList() { } // wakeRobustListOne wakes a single futex from the robust list. -func (t *Task) wakeRobustListOne(addr usermem.Addr) { +func (t *Task) wakeRobustListOne(addr hostarch.Addr) { // Bit 0 in address signals PI futex. 
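// exitRobustList above walks the userspace robust futex list on exit: each
// entry's futex word lives at entry address + FutexOffset, and the entry
// named by ListOpPending is skipped because the dying task never finished
// acquiring it. The sketch below models only that address arithmetic; plain
// uint64 values replace hostarch.Addr, the entry addresses are hypothetical,
// and reading the list from user memory is omitted.
package main

import "fmt"

// robustListHead carries the two fields of linux.RobustListHead used here.
type robustListHead struct {
	FutexOffset   uint64 // offset from a list entry to its futex word
	ListOpPending uint64 // entry currently being acquired, or 0
}

// lockAddrs returns the futex-word addresses that would be woken for the
// given entry addresses, skipping the pending entry, as exitRobustList does.
func lockAddrs(rl robustListHead, entries []uint64) []uint64 {
	var pending uint64
	if rl.ListOpPending != 0 {
		pending = rl.ListOpPending + rl.FutexOffset
	}
	var out []uint64
	for _, e := range entries {
		if lock := e + rl.FutexOffset; lock != pending {
			out = append(out, lock)
		}
	}
	return out
}

func main() {
	rl := robustListHead{FutexOffset: 16, ListOpPending: 0x3000}
	fmt.Println(lockAddrs(rl, []uint64{0x1000, 0x2000, 0x3000})) // [4112 8208]
}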
pi := addr&1 == 1 addr = addr &^ 1 diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go index ce5fbd299..bd5543d4e 100644 --- a/pkg/sentry/kernel/task_image.go +++ b/pkg/sentry/kernel/task_image.go @@ -19,12 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/usermem" ) var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) @@ -129,7 +129,7 @@ func (t *Task) Stack() *arch.Stack { return &arch.Stack{ Arch: t.Arch(), IO: t.MemoryManager(), - Bottom: usermem.Addr(t.Arch().Stack()), + Bottom: hostarch.Addr(t.Arch().Stack()), } } diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index c70e5e6ce..72b9a0384 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -20,6 +20,7 @@ import ( "sort" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) @@ -108,9 +109,9 @@ func (t *Task) debugDumpStack() { return } t.Debugf("Stack:") - start := usermem.Addr(t.Arch().Stack()) + start := hostarch.Addr(t.Arch().Stack()) // Round addr down to a 16-byte boundary. - start &= ^usermem.Addr(15) + start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) @@ -127,7 +128,7 @@ func (t *Task) debugDumpStack() { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { - t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err) break } } @@ -147,9 +148,9 @@ func (t *Task) debugDumpCode() { } t.Debugf("Code:") // Print code on both sides of the instruction register. - start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 + start := hostarch.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 // Round addr down to a 16-byte boundary. - start &= ^usermem.Addr(15) + start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) @@ -166,7 +167,7 @@ func (t *Task) debugDumpCode() { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { - t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err) break } } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 3ccecf4b6..068f25af1 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -23,13 +23,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/goid" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // A taskRunState is a reified state in the task state machine. 
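// debugDumpStack and debugDumpCode above round the dump's starting address
// down to a 16-byte boundary before printing 16 bytes per line. A minimal
// sketch of that masking step; Addr is a local stand-in for hostarch.Addr
// and the example address is arbitrary.
package main

import "fmt"

type Addr uintptr

func main() {
	start := Addr(0x7f1a2b)
	// Clear the low four bits, equivalent to start &= ^hostarch.Addr(15).
	aligned := start &^ Addr(15)
	fmt.Printf("%#x -> %#x\n", start, aligned) // 0x7f1a2b -> 0x7f1a20
}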
See README.md @@ -148,7 +148,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error { region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) - _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found) + _, err := t.CopyInBytes(hostarch.Addr(t.Arch().IP()), found) if err == nil && bytes.Equal(expected, found) { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) @@ -307,8 +307,8 @@ func (app *runApp) execute(t *Task) taskRunState { // normally. if at.Any() { region := trace.StartRegion(t.traceContext, faultRegion) - addr := usermem.Addr(info.Addr()) - err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + addr := hostarch.Addr(info.Addr()) + err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack())) region.End() if err == nil { // The fault was handled appropriately. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 75af3af79..c2b9fc08f 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -23,11 +23,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -243,7 +243,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // Are executing on the main stack, // or the provided alternate stack? - sp := usermem.Addr(t.Arch().Stack()) + sp := hostarch.Addr(t.Arch().Stack()) // N.B. This is a *copy* of the alternate stack that the user's signal // handler expects to see in its ucontext (even if it's not in use). @@ -251,7 +251,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) if act.IsOnStack() && alt.IsEnabled() { alt.SetOnStack() if !alt.Contains(sp) { - sp = usermem.Addr(alt.Top()) + sp = hostarch.Addr(alt.Top()) } } @@ -652,7 +652,7 @@ func (t *Task) SignalStack() arch.SignalStack { // onSignalStack returns true if the task is executing on the given signal stack. func (t *Task) onSignalStack(alt arch.SignalStack) bool { - sp := usermem.Addr(t.Arch().Stack()) + sp := hostarch.Addr(t.Arch().Stack()) return alt.Contains(sp) } @@ -720,7 +720,7 @@ func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (a // CopyOutSignalAct converts the given SignalAct into an architecture-specific // type and then copies it out to task memory. -func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { +func (t *Task) CopyOutSignalAct(addr hostarch.Addr, s *arch.SignalAct) error { n := t.Arch().NewSignalAct() n.SerializeFrom(s) _, err := n.CopyOut(t, addr) @@ -729,7 +729,7 @@ func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { // CopyInSignalAct copies an architecture-specific sigaction type from task // memory and then converts it into a SignalAct. 
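// deliverSignalToHandler above selects the stack pointer for the signal
// frame: keep the current sp unless the action asked for the alternate stack
// (SA_ONSTACK), the alternate stack is enabled, and sp is not already inside
// it, in which case the frame starts at the alternate stack's top. This is a
// hedged, self-contained model of that decision only; the bounds check is
// simplified and does not reproduce arch.SignalStack.Contains exactly, and
// all addresses are invented.
package main

import "fmt"

type altStack struct {
	base, size uint64
	enabled    bool
}

func (s altStack) contains(sp uint64) bool { return sp > s.base && sp <= s.base+s.size }
func (s altStack) top() uint64             { return s.base + s.size }

// pickSP mirrors the sp selection in deliverSignalToHandler.
func pickSP(sp uint64, onStack bool, alt altStack) uint64 {
	if onStack && alt.enabled && !alt.contains(sp) {
		return alt.top()
	}
	return sp
}

func main() {
	alt := altStack{base: 0x20000, size: 0x8000, enabled: true}
	fmt.Printf("%#x\n", pickSP(0x7f000, true, alt))  // 0x28000: switch to the alternate stack
	fmt.Printf("%#x\n", pickSP(0x24000, true, alt))  // 0x24000: already on it
	fmt.Printf("%#x\n", pickSP(0x7f000, false, alt)) // 0x7f000: handler did not ask for it
}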
-func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { +func (t *Task) CopyInSignalAct(addr hostarch.Addr) (arch.SignalAct, error) { n := t.Arch().NewSignalAct() var s arch.SignalAct if _, err := n.CopyIn(t, addr); err != nil { @@ -741,7 +741,7 @@ func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { // CopyOutSignalStack converts the given SignalStack into an // architecture-specific type and then copies it out to task memory. -func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { +func (t *Task) CopyOutSignalStack(addr hostarch.Addr, s *arch.SignalStack) error { n := t.Arch().NewSignalStack() n.SerializeFrom(s) _, err := n.CopyOut(t, addr) @@ -750,7 +750,7 @@ func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error // CopyInSignalStack copies an architecture-specific stack_t from task memory // and then converts it into a SignalStack. -func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { +func (t *Task) CopyInSignalStack(addr hostarch.Addr) (arch.SignalStack, error) { n := t.Arch().NewSignalStack() var s arch.SignalStack if _, err := n.CopyIn(t, addr); err != nil { diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 36e1384f1..32031cd70 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // TaskConfig defines the configuration of a new Task (see below). @@ -86,7 +86,7 @@ type TaskConfig struct { MountNamespaceVFS2 *vfs.MountNamespace // RSeqAddr is a pointer to the the userspace linux.RSeq structure. - RSeqAddr usermem.Addr + RSeqAddr hostarch.Addr // RSeqSignature is the signature that the rseq abort IP must be signed // with. @@ -151,6 +151,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, + cgroups: make(map[Cgroup]struct{}), } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu @@ -189,6 +190,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { t.parent.children[t] = struct{}{} } + if VFS2Enabled { + t.EnterInitialCgroups(t.parent) + } + if tg.leader == nil { // New thread group. tg.leader = t diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 2e84bd88a..2c658d001 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -22,12 +22,12 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application") @@ -153,7 +153,7 @@ func (t *Task) doSyscall() taskRunState { // Check seccomp filters. 
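// newTask above now allocates an empty cgroup set for every task and, when
// VFS2 is enabled, immediately joins the new task to its initial cgroups via
// EnterInitialCgroups(t.parent). The sketch below models only the
// inherit-from-parent shape of that step; cgroups are plain strings, and the
// registry lookup (computeInitialGroups) that supplies root cgroups when
// there is no parent is deliberately elided.
package main

import "fmt"

// enterInitialCgroups returns the cgroup set a new task starts in, copied
// from its parent when one exists.
func enterInitialCgroups(parent map[string]struct{}) map[string]struct{} {
	cgs := make(map[string]struct{})
	for c := range parent {
		cgs[c] = struct{}{}
	}
	return cgs
}

func main() {
	parent := map[string]struct{}{"memory:/app": {}, "cpu:/app": {}}
	fmt.Println(len(enterInitialCgroups(parent))) // 2: child starts in the parent's cgroups
	fmt.Println(len(enterInitialCgroups(nil)))    // 0 here; the real code consults the registry instead
}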
The nil check is for performance (as seccomp use // is rare), not needed for correctness. if t.syscallFilters.Load() != nil { - switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r { case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: t.Debugf("Syscall %d: denied by seccomp", sysno) return (*runSyscallExit)(nil) @@ -283,12 +283,12 @@ func (*runSyscallExit) execute(t *Task) taskRunState { // doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as // indicated by an execution fault at address addr. doVsyscall returns the // task's next run state. -func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { +func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState { vsyscallCount.Increment() // Grab the caller up front, to make sure there's a sensible stack. caller := t.Arch().Native(uintptr(0)) - if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil { + if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil { t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) @@ -322,7 +322,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { } type runVsyscallAfterPtraceEventSeccomp struct { - addr usermem.Addr + addr hostarch.Addr sysno uintptr caller marshal.Marshallable } @@ -337,7 +337,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip // causes do_exit(SIGSYS), and changing sp is ignored. - if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr { t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) return (*runExit)(nil) } diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 94dabbcd8..fc6d9438a 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -27,7 +28,7 @@ import ( // MAX_RW_COUNT is the maximum size in bytes of a single read or write. // Reads and writes that exceed this size may be silently truncated. // (Linux: include/linux/fs.h:MAX_RW_COUNT) -var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) +var MAX_RW_COUNT = int(hostarch.Addr(math.MaxInt32).RoundDown()) // Activate ensures that the task has an active address space. func (t *Task) Activate() { @@ -49,7 +50,7 @@ func (t *Task) Deactivate() { // data without reflection and pass in a byte slice. // // This Task's AddressSpace must be active. -func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { +func (t *Task) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -59,7 +60,7 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { // data without reflection and pass in a byte slice. // // This Task's AddressSpace must be active. 
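// task_usermem.go above defines MAX_RW_COUNT as math.MaxInt32 rounded down
// to a page boundary, the same value as Linux's INT_MAX & PAGE_MASK. A tiny
// sketch of that computation; 4096 is written out as a stand-in for
// hostarch.PageSize.
package main

import (
	"fmt"
	"math"
)

const pageSize = 4096 // stand-in for hostarch.PageSize

func main() {
	maxRW := (int(math.MaxInt32) / pageSize) * pageSize // RoundDown to a page boundary
	fmt.Println(maxRW) // 2147479552 (0x7ffff000)
}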
-func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { +func (t *Task) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -70,7 +71,7 @@ func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { // user memory that is unmapped or not readable by the user. // // This Task's AddressSpace must be active. -func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { +func (t *Task) CopyInString(addr hostarch.Addr, maxlen int) (string, error) { return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -90,7 +91,7 @@ func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { // { "abc" } => 4 (3 for length, 1 for elements) // // This Task's AddressSpace must be active. -func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { +func (t *Task) CopyInVector(addr hostarch.Addr, maxElemSize, maxTotalSize int) ([]string, error) { var v []string for { argAddr := t.Arch().Native(0) @@ -109,12 +110,12 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([ if maxTotalSize < thisMax { thisMax = maxTotalSize } - arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + arg, err := t.CopyInString(hostarch.Addr(t.Arch().Value(argAddr)), thisMax) if err != nil { return v, err } v = append(v, arg) - addr += usermem.Addr(t.Arch().Width()) + addr += hostarch.Addr(t.Arch().Width()) maxTotalSize -= len(arg) + 1 } return v, nil @@ -126,7 +127,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([ // Preconditions: Same as usermem.IO.CopyOut, plus: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. -func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { +func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) error { switch t.Arch().Width() { case 8: const itemLen = 16 @@ -137,8 +138,8 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error b := t.CopyScratchBuffer(itemLen) for ; !src.IsEmpty(); src = src.Tail() { ar := src.Head() - usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) - usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + hostarch.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + hostarch.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) if _, err := t.CopyOutBytes(addr, b); err != nil { return err } @@ -153,8 +154,8 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error } // CopyInIovecs copies an array of numIovecs struct iovecs from the memory -// mapped at addr, converts them to usermem.AddrRanges, and returns them as a -// usermem.AddrRangeSeq. +// mapped at addr, converts them to hostarch.AddrRanges, and returns them as a +// hostarch.AddrRangeSeq. // // CopyInIovecs shares the following properties with Linux's // lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): @@ -175,42 +176,42 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error // Preconditions: Same as usermem.IO.CopyIn, plus: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. 
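// CopyInVector above charges each copied string against maxTotalSize as
// len(arg) + 1, which is why the documented example {"abc"} costs 4 (three
// bytes of content plus one for the element). The sketch below reproduces
// only that accounting; the user-memory copies and maxElemSize clamping are
// omitted and the argv values are invented.
package main

import "fmt"

// vectorCost mirrors CopyInVector's "maxTotalSize -= len(arg) + 1" accounting.
func vectorCost(args []string) int {
	total := 0
	for _, a := range args {
		total += len(a) + 1 // string bytes plus one per element
	}
	return total
}

func main() {
	fmt.Println(vectorCost([]string{"abc"}))                   // 4
	fmt.Println(vectorCost([]string{"/bin/sh", "-c", "true"})) // 8 + 3 + 5 = 16
}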
-func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { +func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRangeSeq, error) { if numIovecs == 0 { - return usermem.AddrRangeSeq{}, nil + return hostarch.AddrRangeSeq{}, nil } - var dst []usermem.AddrRange + var dst []hostarch.AddrRange if numIovecs > 1 { - dst = make([]usermem.AddrRange, 0, numIovecs) + dst = make([]hostarch.AddrRange, 0, numIovecs) } switch t.Arch().Width() { case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { - return usermem.AddrRangeSeq{}, syserror.EFAULT + return hostarch.AddrRangeSeq{}, syserror.EFAULT } b := t.CopyScratchBuffer(itemLen) for i := 0; i < numIovecs; i++ { if _, err := t.CopyInBytes(addr, b); err != nil { - return usermem.AddrRangeSeq{}, err + return hostarch.AddrRangeSeq{}, err } - base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) - length := usermem.ByteOrder.Uint64(b[8:16]) + base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8])) + length := hostarch.ByteOrder.Uint64(b[8:16]) if length > math.MaxInt64 { - return usermem.AddrRangeSeq{}, syserror.EINVAL + return hostarch.AddrRangeSeq{}, syserror.EINVAL } ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) if !ok { - return usermem.AddrRangeSeq{}, syserror.EFAULT + return hostarch.AddrRangeSeq{}, syserror.EFAULT } if numIovecs == 1 { // Special case to avoid allocating dst. - return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil + return hostarch.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil } dst = append(dst, ar) @@ -218,7 +219,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange } default: - return usermem.AddrRangeSeq{}, syserror.ENOSYS + return hostarch.AddrRangeSeq{}, syserror.ENOSYS } // Truncate to MAX_RW_COUNT. @@ -226,13 +227,13 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange for i := range dst { dstlen := uint64(dst[i].Length()) if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { - dst[i].End -= usermem.Addr(dstlen - rem) + dst[i].End -= hostarch.Addr(dstlen - rem) dstlen = rem } total += dstlen } - return usermem.AddrRangeSeqFromSlice(dst), nil + return hostarch.AddrRangeSeqFromSlice(dst), nil } // SingleIOSequence returns a usermem.IOSequence representing [addr, @@ -245,7 +246,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange // write syscalls in Linux do not use import_single_range(). However they check // access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address // ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) -func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { +func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { if length > MAX_RW_COUNT { length = MAX_RW_COUNT } @@ -255,7 +256,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp } return usermem.IOSequence{ IO: t.MemoryManager(), - Addrs: usermem.AddrRangeSeqOf(ar), + Addrs: hostarch.AddrRangeSeqOf(ar), Opts: opts, }, nil } @@ -267,7 +268,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). // // Preconditions: Same as Task.CopyInIovecs. 
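// CopyInIovecs above caps the total bytes described by an iovec array at
// MAX_RW_COUNT by walking the converted ranges and shrinking the first one
// that pushes the running total past the limit (ranges after it end up with
// length zero). This is a simplified, self-contained version of that
// truncation over plain start/end pairs; the limit and addresses are
// arbitrary and hostarch.AddrRange is not used.
package main

import "fmt"

type addrRange struct{ start, end uint64 }

func (r addrRange) length() uint64 { return r.end - r.start }

// truncate mirrors the MAX_RW_COUNT loop at the end of CopyInIovecs.
func truncate(rs []addrRange, limit uint64) []addrRange {
	var total uint64
	for i := range rs {
		l := rs[i].length()
		if rem := limit - total; rem < l {
			rs[i].end -= l - rem
			l = rem
		}
		total += l
	}
	return rs
}

func main() {
	rs := []addrRange{{0x1000, 0x1800}, {0x4000, 0x5000}, {0x8000, 0x9000}}
	// With a 0x1000-byte limit the second range shrinks to 0x800 bytes and
	// the third to zero.
	fmt.Println(truncate(rs, 0x1000))
}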
-func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { +func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return usermem.IOSequence{}, syserror.EINVAL } @@ -317,7 +318,7 @@ func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) { } // CopyInBytes implements marshal.CopyContext.CopyInBytes. -func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { +func (cc *taskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err @@ -327,7 +328,7 @@ func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, erro } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. -func (cc *taskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { +func (cc *taskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err @@ -360,11 +361,11 @@ func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte { } // CopyInBytes implements marshal.CopyContext.CopyInBytes. -func (cc *ownTaskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { +func (cc *ownTaskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts) } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. -func (cc *ownTaskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { +func (cc *ownTaskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts) } diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 09d070ec8..77ad62445 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -114,6 +114,15 @@ func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { } } +// forEachTaskLocked applies f to each Task in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) { + for t := range ts.Root.tids { + f(t) + } +} + // A PIDNamespace represents a PID namespace, a bimap between thread IDs and // tasks. See the pid_namespaces(7) man page for further details. 
// diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index cf2f7ca72..dfc3c0719 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -17,12 +17,12 @@ package kernel import ( "testing" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // mockClocks is a sentrytime.Clocks that simply returns the times in the @@ -54,7 +54,7 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { ctx := contexttest.Context(tb) mfp := pgalloc.MemoryFileProviderFromContext(ctx) - fr, err := mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) + fr, err := mfp.MemoryFile().Allocate(hostarch.PageSize, usage.Anonymous) if err != nil { tb.Fatalf("failed to allocate memory: %v", err) } diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 9e5c2d26f..cc0917504 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -17,10 +17,10 @@ package kernel import ( "fmt" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/usermem" ) // vdsoParams are the parameters exposed to the VDSO. @@ -96,7 +96,7 @@ func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSO // access returns a mapping of the param page. func (v *VDSOParamPage) access() (safemem.Block, error) { - bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite) + bs, err := v.mfp.MemoryFile().MapInternal(v.fr, hostarch.ReadWrite) if err != nil { return safemem.Block{}, err } |
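// The new TaskSet.forEachTaskLocked helper above iterates every task in the
// root PID namespace's tid map while ts.mu is held. The following is a
// minimal stand-alone model of that pattern; a plain map and sync.RWMutex
// replace the TaskSet and PIDNamespace internals.
package main

import (
	"fmt"
	"sync"
)

type task struct{ name string }

type taskSet struct {
	mu   sync.RWMutex
	tids map[*task]int // task -> thread ID, like PIDNamespace.tids
}

// forEachTaskLocked applies f to every task. The caller must hold ts.mu.
func (ts *taskSet) forEachTaskLocked(f func(t *task)) {
	for t := range ts.tids {
		f(t)
	}
}

func main() {
	t1, t2 := &task{name: "init"}, &task{name: "worker"}
	ts := &taskSet{tids: map[*task]int{t1: 1, t2: 2}}

	ts.mu.RLock()
	ts.forEachTaskLocked(func(t *task) { fmt.Println(t.name, ts.tids[t]) })
	ts.mu.RUnlock()
}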