summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/threads.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/threads.go')
-rw-r--r--pkg/sentry/kernel/threads.go465
1 files changed, 465 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
new file mode 100644
index 000000000..656bbd46c
--- /dev/null
+++ b/pkg/sentry/kernel/threads.go
@@ -0,0 +1,465 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// TasksLimit is the maximum number of threads for untrusted application.
+// Linux doesn't really limit this directly, rather it is limited by total
+// memory size, stacks allocated and a global maximum. There's no real reason
+// for us to limit it either, (esp. since threads are backed by go routines),
+// and we would expect to hit resource limits long before hitting this number.
+// However, for correctness, we still check that the user doesn't exceed this
+// number.
+//
+// Note that because of the way futexes are implemented, there *are* in fact
+// serious restrictions on valid thread IDs. They are limited to 2^30 - 1
+// (kernel/fork.c:MAX_THREADS).
+const TasksLimit = (1 << 16)
+
+// ThreadID is a generic thread identifier.
+type ThreadID int32
+
+// String returns a decimal representation of the ThreadID.
+func (tid ThreadID) String() string {
+ return fmt.Sprintf("%d", tid)
+}
+
+// InitTID is the TID given to the first task added to each PID namespace. The
+// thread group led by InitTID is called the namespace's init process. The
+// death of a PID namespace's init process causes all tasks visible in that
+// namespace to be killed.
+const InitTID ThreadID = 1
+
+// A TaskSet comprises all tasks in a system.
+//
+// +stateify savable
+type TaskSet struct {
+ // mu protects all relationships betweens tasks and thread groups in the
+ // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
+ mu sync.RWMutex `state:"nosave"`
+
+ // Root is the root PID namespace, in which all tasks in the TaskSet are
+ // visible. The Root pointer is immutable.
+ Root *PIDNamespace
+
+ // sessions is the set of all sessions.
+ sessions sessionList
+
+ // stopCount is the number of active external stops applicable to all tasks
+ // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been
+ // paired with a call to TaskSet.EndExternalStop). stopCount is protected
+ // by mu.
+ //
+ // stopCount is not saved for the same reason as Task.stopCount; it is
+ // always reset to zero after restore.
+ stopCount int32 `state:"nosave"`
+
+ // liveGoroutines is the number of non-exited task goroutines in the
+ // TaskSet.
+ //
+ // liveGoroutines is not saved; it is reset as task goroutines are
+ // restarted by Task.Start.
+ liveGoroutines sync.WaitGroup `state:"nosave"`
+
+ // runningGoroutines is the number of running task goroutines in the
+ // TaskSet.
+ //
+ // runningGoroutines is not saved; its counter value is required to be zero
+ // at time of save (but note that this is not necessarily the same thing as
+ // sync.WaitGroup's zero value).
+ runningGoroutines sync.WaitGroup `state:"nosave"`
+}
+
+// newTaskSet returns a new, empty TaskSet.
+func newTaskSet() *TaskSet {
+ ts := &TaskSet{}
+ ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace())
+ return ts
+}
+
+// forEachThreadGroupLocked applies f to each thread group in ts.
+//
+// Preconditions: ts.mu must be locked (for reading or writing).
+func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
+ for tg := range ts.Root.tgids {
+ f(tg)
+ }
+}
+
+// A PIDNamespace represents a PID namespace, a bimap between thread IDs and
+// tasks. See the pid_namespaces(7) man page for further details.
+//
+// N.B. A task is said to be visible in a PID namespace if the PID namespace
+// contains a thread ID that maps to that task.
+//
+// +stateify savable
+type PIDNamespace struct {
+ // owner is the TaskSet that this PID namespace belongs to. The owner
+ // pointer is immutable.
+ owner *TaskSet
+
+ // parent is the PID namespace of the process that created this one. If
+ // this is the root PID namespace, parent is nil. The parent pointer is
+ // immutable.
+ //
+ // Invariant: All tasks that are visible in this namespace are also visible
+ // in all ancestor namespaces.
+ parent *PIDNamespace
+
+ // userns is the user namespace with which this PID namespace is
+ // associated. Privileged operations on this PID namespace must have
+ // appropriate capabilities in userns. The userns pointer is immutable.
+ userns *auth.UserNamespace
+
+ // The following fields are protected by owner.mu.
+
+ // last is the last ThreadID to be allocated in this namespace.
+ last ThreadID
+
+ // tasks is a mapping from ThreadIDs in this namespace to tasks visible in
+ // the namespace.
+ tasks map[ThreadID]*Task
+
+ // tids is a mapping from tasks visible in this namespace to their
+ // identifiers in this namespace.
+ tids map[*Task]ThreadID
+
+ // tgids is a mapping from thread groups visible in this namespace to
+ // their identifiers in this namespace.
+ //
+ // The content of tgids is equivalent to tids[tg.leader]. This exists
+ // primarily as an optimization to quickly find all thread groups.
+ tgids map[*ThreadGroup]ThreadID
+
+ // sessions is a mapping from SessionIDs in this namespace to sessions
+ // visible in the namespace.
+ sessions map[SessionID]*Session
+
+ // sids is a mapping from sessions visible in this namespace to their
+ // identifiers in this namespace.
+ sids map[*Session]SessionID
+
+ // processGroups is a mapping from ProcessGroupIDs in this namespace to
+ // process groups visible in the namespace.
+ processGroups map[ProcessGroupID]*ProcessGroup
+
+ // pgids is a mapping from process groups visible in this namespace to
+ // their identifiers in this namespace.
+ pgids map[*ProcessGroup]ProcessGroupID
+
+ // exiting indicates that the namespace's init process is exiting or has
+ // exited.
+ exiting bool
+}
+
+func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
+ return &PIDNamespace{
+ owner: ts,
+ parent: parent,
+ userns: userns,
+ tasks: make(map[ThreadID]*Task),
+ tids: make(map[*Task]ThreadID),
+ tgids: make(map[*ThreadGroup]ThreadID),
+ sessions: make(map[SessionID]*Session),
+ sids: make(map[*Session]SessionID),
+ processGroups: make(map[ProcessGroupID]*ProcessGroup),
+ pgids: make(map[*ProcessGroup]ProcessGroupID),
+ }
+}
+
+// NewChild returns a new, empty PID namespace that is a child of ns. Authority
+// over the new PID namespace is controlled by userns.
+func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
+ return newPIDNamespace(ns.owner, ns, userns)
+}
+
+// TaskWithID returns the task with thread ID tid in PID namespace ns. If no
+// task has that TID, TaskWithID returns nil.
+func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task {
+ ns.owner.mu.RLock()
+ t := ns.tasks[tid]
+ ns.owner.mu.RUnlock()
+ return t
+}
+
+// ThreadGroupWithID returns the thread group lead by the task with thread ID
+// tid in PID namespace ns. If no task has that TID, or if the task with that
+// TID is not a thread group leader, ThreadGroupWithID returns nil.
+func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ t := ns.tasks[tid]
+ if t == nil {
+ return nil
+ }
+ if t != t.tg.leader {
+ return nil
+ }
+ return t.tg
+}
+
+// IDOfTask returns the TID assigned to the given task in PID namespace ns. If
+// the task is not visible in that namespace, IDOfTask returns 0. (This return
+// value is significant in some cases, e.g. getppid() is documented as
+// returning 0 if the caller's parent is in an ancestor namespace and
+// consequently not visible to the caller.) If the task is nil, IDOfTask returns
+// 0.
+func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
+ ns.owner.mu.RLock()
+ id := ns.tids[t]
+ ns.owner.mu.RUnlock()
+ return id
+}
+
+// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns.
+// If the task is not visible in that namespace, IDOfThreadGroup returns 0.
+func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
+ ns.owner.mu.RLock()
+ id := ns.tgids[tg]
+ ns.owner.mu.RUnlock()
+ return id
+}
+
+// Tasks returns a snapshot of the tasks in ns.
+func (ns *PIDNamespace) Tasks() []*Task {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ tasks := make([]*Task, 0, len(ns.tasks))
+ for t := range ns.tids {
+ tasks = append(tasks, t)
+ }
+ return tasks
+}
+
+// ThreadGroups returns a snapshot of the thread groups in ns.
+func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
+ return ns.ThreadGroupsAppend(nil)
+}
+
+// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs.
+func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ for tg := range ns.tgids {
+ tgs = append(tgs, tg)
+ }
+ return tgs
+}
+
+// UserNamespace returns the user namespace associated with PID namespace ns.
+func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace {
+ return ns.userns
+}
+
+// A threadGroupNode defines the relationship between a thread group and the
+// rest of the system. Conceptually, threadGroupNode is data belonging to the
+// owning TaskSet, as if TaskSet contained a field `nodes
+// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons,
+// threadGroupNode is embedded in the ThreadGroup it represents.
+// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose
+// threadGroupEntry's methods on ThreadGroup to make it implement
+// threadGroupLinker.)
+//
+// +stateify savable
+type threadGroupNode struct {
+ // pidns is the PID namespace containing the thread group and all of its
+ // member tasks. The pidns pointer is immutable.
+ pidns *PIDNamespace
+
+ // eventQueue is notified whenever a event of interest to Task.Wait occurs
+ // in a child of this thread group, or a ptrace tracee of a task in this
+ // thread group. Events are defined in task_exit.go.
+ //
+ // Note that we cannot check and save this wait queue similarly to other
+ // wait queues, as the queue will not be empty by the time of saving, due
+ // to the wait sourced from Exec().
+ eventQueue waiter.Queue `state:"nosave"`
+
+ // leader is the thread group's leader, which is the oldest task in the
+ // thread group; usually the last task in the thread group to call
+ // execve(), or if no such task exists then the first task in the thread
+ // group, which was created by a call to fork() or clone() without
+ // CLONE_THREAD. Once a thread group has been made visible to the rest of
+ // the system by TaskSet.newTask, leader is never nil.
+ //
+ // Note that it's possible for the leader to exit without causing the rest
+ // of the thread group to exit; in such a case, leader will still be valid
+ // and non-nil, but leader will not be in tasks.
+ //
+ // leader is protected by the TaskSet mutex.
+ leader *Task
+
+ // If execing is not nil, it is a task in the thread group that has killed
+ // all other tasks so that it can become the thread group leader and
+ // perform an execve. (execing may already be the thread group leader.)
+ //
+ // execing is analogous to Linux's signal_struct::group_exit_task.
+ //
+ // execing is protected by the TaskSet mutex.
+ execing *Task
+
+ // tasks is all tasks in the thread group that have not yet been reaped.
+ //
+ // tasks is protected by both the TaskSet mutex and the signal mutex:
+ // Mutating tasks requires locking the TaskSet mutex for writing *and*
+ // locking the signal mutex. Reading tasks requires locking the TaskSet
+ // mutex *or* locking the signal mutex.
+ tasks taskList
+
+ // tasksCount is the number of tasks in the thread group that have not yet
+ // been reaped; equivalently, tasksCount is the number of tasks in tasks.
+ //
+ // tasksCount is protected by both the TaskSet mutex and the signal mutex,
+ // as with tasks.
+ tasksCount int
+
+ // liveTasks is the number of tasks in the thread group that have not yet
+ // reached TaskExitZombie.
+ //
+ // liveTasks is protected by the TaskSet mutex (NOT the signal mutex).
+ liveTasks int
+
+ // activeTasks is the number of tasks in the thread group that have not yet
+ // reached TaskExitInitiated.
+ //
+ // activeTasks is protected by both the TaskSet mutex and the signal mutex,
+ // as with tasks.
+ activeTasks int
+}
+
+// PIDNamespace returns the PID namespace containing tg.
+func (tg *ThreadGroup) PIDNamespace() *PIDNamespace {
+ return tg.pidns
+}
+
+// TaskSet returns the TaskSet containing tg.
+func (tg *ThreadGroup) TaskSet() *TaskSet {
+ return tg.pidns.owner
+}
+
+// Leader returns tg's leader.
+func (tg *ThreadGroup) Leader() *Task {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ return tg.leader
+}
+
+// Count returns the number of non-exited threads in the group.
+func (tg *ThreadGroup) Count() int {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ var count int
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ count++
+ }
+ return count
+}
+
+// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for
+// all tasks in tg.
+func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+
+ var tasks []ThreadID
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ if id, ok := pidns.tids[t]; ok {
+ tasks = append(tasks, id)
+ }
+ }
+ return tasks
+}
+
+// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader
+// is dead, ID returns 0.
+func (tg *ThreadGroup) ID() ThreadID {
+ tg.pidns.owner.mu.RLock()
+ id := tg.pidns.tgids[tg]
+ tg.pidns.owner.mu.RUnlock()
+ return id
+}
+
+// A taskNode defines the relationship between a task and the rest of the
+// system. The comments on threadGroupNode also apply to taskNode.
+//
+// +stateify savable
+type taskNode struct {
+ // tg is the thread group that this task belongs to. The tg pointer is
+ // immutable.
+ tg *ThreadGroup `state:"wait"`
+
+ // taskEntry links into tg.tasks. Note that this means that
+ // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread
+ // group. See threadGroupNode.tasks for synchronization info.
+ taskEntry
+
+ // parent is the task's parent. parent may be nil.
+ //
+ // parent is protected by the TaskSet mutex.
+ parent *Task
+
+ // children is this task's children.
+ //
+ // children is protected by the TaskSet mutex.
+ children map[*Task]struct{}
+
+ // If childPIDNamespace is not nil, all new tasks created by this task will
+ // be members of childPIDNamespace rather than this one. (As a corollary,
+ // this task becomes unable to create sibling tasks in the same thread
+ // group.)
+ //
+ // childPIDNamespace is exclusive to the task goroutine.
+ childPIDNamespace *PIDNamespace
+}
+
+// ThreadGroup returns the thread group containing t.
+func (t *Task) ThreadGroup() *ThreadGroup {
+ return t.tg
+}
+
+// PIDNamespace returns the PID namespace containing t.
+func (t *Task) PIDNamespace() *PIDNamespace {
+ return t.tg.pidns
+}
+
+// TaskSet returns the TaskSet containing t.
+func (t *Task) TaskSet() *TaskSet {
+ return t.tg.pidns.owner
+}
+
+// Timekeeper returns the system Timekeeper.
+func (t *Task) Timekeeper() *Timekeeper {
+ return t.k.timekeeper
+}
+
+// Parent returns t's parent.
+func (t *Task) Parent() *Task {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ return t.parent
+}
+
+// ThreadID returns t's thread ID in its own PID namespace. If the task is
+// dead, ThreadID returns 0.
+func (t *Task) ThreadID() ThreadID {
+ return t.tg.pidns.IDOfTask(t)
+}