author    Michael Pratt <mpratt@google.com>         2019-04-03 16:21:38 -0700
committer Shentubot <shentubot@google.com>          2019-04-03 16:22:43 -0700
commit    4968dd1341a04e93557bdd9f4b4b83eb508e026d
tree      50ef3c28ec24fad937f029f257cbe3338222445f
parent    82529becaee6f5050cb3ebb4aaa7a798357c1cf1
Cache ThreadGroups in PIDNamespace
If there are thousands of threads, ThreadGroupsAppend becomes very
expensive as it must iterate over all Tasks to find the ThreadGroup
leaders.
Reduce the cost by maintaining a map of ThreadGroups which can be used
to grab them all directly.
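To make the shape of the change concrete, the following is a self-contained Go sketch of the old and new lookup strategies. The types and the threadGroupsBefore/threadGroupsAfter helpers are heavily simplified stand-ins, not code from pkg/sentry/kernel; the real maps appear in the diff below.

package main

import "fmt"

// Pared-down stand-ins for the kernel types: the real Task, ThreadGroup and
// PIDNamespace live in pkg/sentry/kernel and carry far more state.
type ThreadID int32

type ThreadGroup struct{ leader *Task }

type Task struct{ tg *ThreadGroup }

type PIDNamespace struct {
	// tids maps every task (thread) to its ID, as before this change.
	tids map[*Task]ThreadID
	// tgids additionally maps each thread group to its ID, so callers no
	// longer have to scan every task looking for group leaders.
	tgids map[*ThreadGroup]ThreadID
}

// threadGroupsBefore mirrors the old ThreadGroupsAppend: O(#tasks), filtering
// for leaders.
func (ns *PIDNamespace) threadGroupsBefore() []*ThreadGroup {
	var tgs []*ThreadGroup
	for t := range ns.tids {
		if t == t.tg.leader {
			tgs = append(tgs, t.tg)
		}
	}
	return tgs
}

// threadGroupsAfter mirrors the new ThreadGroupsAppend: O(#thread groups).
func (ns *PIDNamespace) threadGroupsAfter() []*ThreadGroup {
	var tgs []*ThreadGroup
	for tg := range ns.tgids {
		tgs = append(tgs, tg)
	}
	return tgs
}

func main() {
	ns := &PIDNamespace{
		tids:  make(map[*Task]ThreadID),
		tgids: make(map[*ThreadGroup]ThreadID),
	}
	// One thread group: a leader plus three extra threads.
	tg := &ThreadGroup{}
	leader := &Task{tg: tg}
	tg.leader = leader
	ns.tids[leader] = 1
	ns.tgids[tg] = 1
	for i := ThreadID(2); i <= 4; i++ {
		ns.tids[&Task{tg: tg}] = i
	}
	fmt.Println(len(ns.threadGroupsBefore()), len(ns.threadGroupsAfter())) // 1 1
}

Both helpers return the same groups; the difference is that the second one no longer pays a cost proportional to the total thread count.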
The one somewhat visible change is to convert PID namespace init
children zapping to a group-directed SIGKILL, as Linux did in
82058d668465 "signal: Use group_send_sig_info to kill all processes in a
pid namespace".
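For the init-zapping path, the practical effect is that namespace teardown now issues one group-directed SIGKILL per thread group rather than one task-directed SIGKILL per task. The toy model below only counts sends; killNamespaceOld and killNamespaceNew are illustrative names, and the real logic is in task_exit.go in the diff below.

package main

import "fmt"

// Toy stand-ins: real delivery goes through Task.sendSignalLocked in
// pkg/sentry/kernel and involves signal-handler locking, queueing, etc.
type ThreadGroup struct{ leader *Task }

type Task struct{ tg *ThreadGroup }

// killNamespaceOld mirrors the old loop: walk every task in the namespace and
// send a task-directed (group=false) SIGKILL to each non-init task.
func killNamespaceOld(tids map[*Task]struct{}, init *ThreadGroup) (sent int) {
	for other := range tids {
		if other.tg != init {
			sent++ // stands in for other.sendSignalLocked(SIGKILL, false /* group */)
		}
	}
	return sent
}

// killNamespaceNew mirrors the new loop: walk the thread groups and send one
// group-directed (group=true) SIGKILL via each non-init leader.
func killNamespaceNew(tgids map[*ThreadGroup]struct{}, init *ThreadGroup) (sent int) {
	for other := range tgids {
		if other == init {
			continue
		}
		sent++ // stands in for other.leader.sendSignalLocked(SIGKILL, true /* group */)
	}
	return sent
}

func main() {
	tids := make(map[*Task]struct{})
	tgids := make(map[*ThreadGroup]struct{})
	var init *ThreadGroup
	// Three thread groups (the first playing the role of init), 100 threads each.
	for g := 0; g < 3; g++ {
		tg := &ThreadGroup{}
		if g == 0 {
			init = tg
		}
		tgids[tg] = struct{}{}
		for i := 0; i < 100; i++ {
			t := &Task{tg: tg}
			if i == 0 {
				tg.leader = t
			}
			tids[t] = struct{}{}
		}
	}
	fmt.Println(killNamespaceOld(tids, init), killNamespaceNew(tgids, init)) // 200 2
}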
In a benchmark that creates N threads which sleep for two minutes, we
see approximately this much CPU time in ThreadGroupsAppend:
Before:
1 thread: 0ms
1024 threads: 30ms - 9130ms
4096 threads: 50ms - 2000ms
8192 threads: 18160ms
16384 threads: 17210ms
After:
1 thread: 0ms
1024 threads: 0ms
4096 threads: 0ms
8192 threads: 0ms
16384 threads: 0ms
The profiling is extremely noisy (likely due to cache effects): some runs show
almost no samples at 1024 and 4096 threads, but the old approach clearly does
not scale to large thread counts.
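The benchmark program is not part of this change. A minimal Go workload that would exercise the same path might look like the sketch below; the flag name and the use of runtime.LockOSThread and debug.SetMaxThreads are assumptions, not the actual benchmark.

package main

// Hypothetical reconstruction of the workload described above: the commit does
// not include the benchmark source, so the flag name, thread-count handling,
// and the use of runtime.LockOSThread are assumptions, not gVisor code.
import (
	"flag"
	"runtime"
	"runtime/debug"
	"sync"
	"time"
)

func main() {
	n := flag.Int("threads", 1024, "number of OS threads to create")
	flag.Parse()

	// The Go runtime defaults to a 10000-thread limit; raise it so the
	// 16384-thread case can run.
	debug.SetMaxThreads(*n + 128)

	var wg sync.WaitGroup
	for i := 0; i < *n; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Pin this goroutine to its own OS thread so the sandboxed
			// kernel really sees *n tasks, then idle for two minutes.
			runtime.LockOSThread()
			time.Sleep(2 * time.Minute)
		}()
	}
	wg.Wait()
}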
PiperOrigin-RevId: 241828039
Change-Id: I17827c90045df4b3c49b3174f3a05bca3026a72c
-rw-r--r--  pkg/sentry/kernel/kernel.go     | 10
-rw-r--r--  pkg/sentry/kernel/sessions.go   | 18
-rw-r--r--  pkg/sentry/kernel/task_exec.go  |  2
-rw-r--r--  pkg/sentry/kernel/task_exit.go  | 18
-rw-r--r--  pkg/sentry/kernel/task_start.go |  7
-rw-r--r--  pkg/sentry/kernel/threads.go    | 24
6 files changed, 48 insertions(+), 31 deletions(-)
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index f7f471aaa..1cd2653ff 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -866,14 +866,14 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
 	defer k.tasks.mu.RUnlock()
 
 	var lastErr error
-	for t := range k.tasks.Root.tids {
-		if t == t.tg.leader && t.ContainerID() == cid {
-			t.tg.signalHandlers.mu.Lock()
-			defer t.tg.signalHandlers.mu.Unlock()
+	for tg := range k.tasks.Root.tgids {
+		if tg.leader.ContainerID() == cid {
+			tg.signalHandlers.mu.Lock()
 			infoCopy := *info
-			if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
+			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
 				lastErr = err
 			}
+			tg.signalHandlers.mu.Unlock()
 		}
 	}
 	return lastErr
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index ae6daac60..65e2b73c4 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -240,14 +240,14 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
 	defer tasks.mu.RUnlock()
 
 	var lastErr error
-	for t := range tasks.Root.tids {
-		if t == t.tg.leader && t.tg.ProcessGroup() == pg {
-			t.tg.signalHandlers.mu.Lock()
-			defer t.tg.signalHandlers.mu.Unlock()
+	for tg := range tasks.Root.tgids {
+		if tg.ProcessGroup() == pg {
+			tg.signalHandlers.mu.Lock()
 			infoCopy := *info
-			if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
+			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
 				lastErr = err
 			}
+			tg.signalHandlers.mu.Unlock()
 		}
 	}
 	return lastErr
@@ -268,7 +268,7 @@ func (tg *ThreadGroup) CreateSession() error {
 // Precondition: callers must hold TaskSet.mu for writing.
 func (tg *ThreadGroup) createSession() error {
 	// Get the ID for this thread in the current namespace.
-	id := tg.pidns.tids[tg.leader]
+	id := tg.pidns.tgids[tg]
 
 	// Check if this ThreadGroup already leads a Session, or
 	// if the proposed group is already taken.
@@ -337,7 +337,7 @@ func (tg *ThreadGroup) createSession() error {
 
 	// Ensure a translation is added to all namespaces.
 	for ns := tg.pidns; ns != nil; ns = ns.parent {
-		local := ns.tids[tg.leader]
+		local := ns.tgids[tg]
 		ns.sids[s] = SessionID(local)
 		ns.sessions[SessionID(local)] = s
 		ns.pgids[pg] = ProcessGroupID(local)
@@ -356,7 +356,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
 	defer tg.pidns.owner.mu.Unlock()
 
 	// Get the ID for this thread in the current namespace.
-	id := tg.pidns.tids[tg.leader]
+	id := tg.pidns.tgids[tg]
 
 	// Per above, check for a Session leader or existing group.
 	for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
@@ -401,7 +401,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
 
 	// Ensure this translation is added to all namespaces.
 	for ns := tg.pidns; ns != nil; ns = ns.parent {
-		local := ns.tids[tg.leader]
+		local := ns.tgids[tg]
 		ns.pgids[pg] = ProcessGroupID(local)
 		ns.processGroups[ProcessGroupID(local)] = pg
 	}
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index a9b74da8e..9fca90a1c 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -234,6 +234,8 @@ func (t *Task) promoteLocked() {
 		ns.tids[t] = leaderTID
 		ns.tasks[oldTID] = oldLeader
 		ns.tasks[leaderTID] = t
+		// Neither the ThreadGroup nor TGID change, so no need to
+		// update ns.tgids.
 	}
 
 	// Inherit the old leader's start time.
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b9c558ccb..1a0734ab6 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -329,14 +329,15 @@ func (t *Task) exitChildren() {
 		// signal." - pid_namespaces(7)
 		t.Debugf("Init process terminating, killing namespace")
 		t.tg.pidns.exiting = true
-		for other := range t.tg.pidns.tids {
-			if other.tg != t.tg {
-				other.tg.signalHandlers.mu.Lock()
-				other.sendSignalLocked(&arch.SignalInfo{
-					Signo: int32(linux.SIGKILL),
-				}, false /* group */)
-				other.tg.signalHandlers.mu.Unlock()
+		for other := range t.tg.pidns.tgids {
+			if other == t.tg {
+				continue
 			}
+			other.signalHandlers.mu.Lock()
+			other.leader.sendSignalLocked(&arch.SignalInfo{
+				Signo: int32(linux.SIGKILL),
+			}, true /* group */)
+			other.signalHandlers.mu.Unlock()
 		}
 		// TODO: The init process waits for all processes in the
 		// namespace to exit before completing its own exit
@@ -652,6 +653,9 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
 			tid := ns.tids[t]
 			delete(ns.tasks, tid)
 			delete(ns.tids, t)
+			if t == t.tg.leader {
+				delete(ns.tgids, t.tg)
+			}
 		}
 		t.tg.exitedCPUStats.Accumulate(t.CPUStats())
 		t.tg.ioUsage.Accumulate(t.ioUsage)
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index c82a32c78..b7534c0a2 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -212,11 +212,18 @@ func (ts *TaskSet) assignTIDsLocked(t *Task) error {
 			for _, a := range allocatedTIDs {
 				delete(a.ns.tasks, a.tid)
 				delete(a.ns.tids, t)
+				if t.tg.leader == nil {
+					delete(a.ns.tgids, t.tg)
+				}
 			}
 			return err
 		}
 		ns.tasks[tid] = t
 		ns.tids[t] = tid
+		if t.tg.leader == nil {
+			// New thread group.
+			ns.tgids[t.tg] = tid
+		}
 		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
 	}
 	return nil
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index bdb907905..4af1b7dfa 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -100,10 +100,8 @@ func newTaskSet() *TaskSet {
 //
 // Preconditions: ts.mu must be locked (for reading or writing).
 func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
-	for t := range ts.Root.tids {
-		if t == t.tg.leader {
-			f(t.tg)
-		}
+	for tg := range ts.Root.tgids {
+		f(tg)
 	}
 }
 
@@ -145,6 +143,13 @@ type PIDNamespace struct {
 	// identifiers in this namespace.
 	tids map[*Task]ThreadID
 
+	// tgids is a mapping from thread groups visible in this namespace to
+	// their identifiers in this namespace.
+	//
+	// The content of tgids is equivalent to tids[tg.leader]. This exists
+	// primarily as an optimization to quickly find all thread groups.
+	tgids map[*ThreadGroup]ThreadID
+
 	// sessions is a mapping from SessionIDs in this namespace to sessions
 	// visible in the namespace.
 	sessions map[SessionID]*Session
@@ -173,6 +178,7 @@ func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespa
 		userns:        userns,
 		tasks:         make(map[ThreadID]*Task),
 		tids:          make(map[*Task]ThreadID),
+		tgids:         make(map[*ThreadGroup]ThreadID),
 		sessions:      make(map[SessionID]*Session),
 		sids:          make(map[*Session]SessionID),
 		processGroups: make(map[ProcessGroupID]*ProcessGroup),
@@ -227,7 +233,7 @@ func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
 func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
 	ns.owner.mu.RLock()
 	defer ns.owner.mu.RUnlock()
-	return ns.tids[tg.leader]
+	return ns.tgids[tg]
 }
 
 // Tasks returns a snapshot of the tasks in ns.
@@ -250,10 +256,8 @@ func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
 func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
 	ns.owner.mu.RLock()
 	defer ns.owner.mu.RUnlock()
-	for t := range ns.tids {
-		if t == t.tg.leader {
-			tgs = append(tgs, t.tg)
-		}
+	for tg := range ns.tgids {
+		tgs = append(tgs, tg)
 	}
 	return tgs
 }
@@ -387,7 +391,7 @@ func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID {
 func (tg *ThreadGroup) ID() ThreadID {
 	tg.pidns.owner.mu.RLock()
 	defer tg.pidns.owner.mu.RUnlock()
-	return tg.pidns.tids[tg.leader]
+	return tg.pidns.tgids[tg]
 }
 
 // A taskNode defines the relationship between a task and the rest of the