// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // TaskConfig defines the configuration of a new Task (see below). type TaskConfig struct { // Kernel is the owning Kernel. Kernel *Kernel // Parent is the new task's parent. Parent may be nil. Parent *Task // If InheritParent is not nil, use InheritParent's parent as the new // task's parent. InheritParent *Task // ThreadGroup is the ThreadGroup the new task belongs to. ThreadGroup *ThreadGroup // SignalMask is the new task's initial signal mask. SignalMask linux.SignalSet // TaskImage is the TaskImage of the new task. Ownership of the // TaskImage is transferred to TaskSet.NewTask, whether or not it // succeeds. TaskImage *TaskImage // FSContext is the FSContext of the new task. A reference must be held on // FSContext, which is transferred to TaskSet.NewTask whether or not it // succeeds. FSContext *FSContext // FDTable is the FDTableof the new task. A reference must be held on // FDMap, which is transferred to TaskSet.NewTask whether or not it // succeeds. FDTable *FDTable // Credentials is the Credentials of the new task. Credentials *auth.Credentials // Niceness is the niceness of the new task. Niceness int // NetworkNamespace is the network namespace to be used for the new task. NetworkNamespace *inet.Namespace // AllowedCPUMask contains the cpus that this task can run on. AllowedCPUMask sched.CPUSet // UTSNamespace is the UTSNamespace of the new task. UTSNamespace *UTSNamespace // IPCNamespace is the IPCNamespace of the new task. IPCNamespace *IPCNamespace // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace // MountNamespaceVFS2 is the MountNamespace of the new task. MountNamespaceVFS2 *vfs.MountNamespace // RSeqAddr is a pointer to the the userspace linux.RSeq structure. RSeqAddr hostarch.Addr // RSeqSignature is the signature that the rseq abort IP must be signed // with. RSeqSignature uint32 // ContainerID is the container the new task belongs to. ContainerID string } // NewTask creates a new task defined by cfg. // // NewTask does not start the returned task; the caller must call Task.Start. // // If successful, NewTask transfers references held by cfg to the new task. // Otherwise, NewTask releases them. func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskImage.release() cfg.FSContext.DecRef(ctx) cfg.FDTable.DecRef(ctx) cfg.IPCNamespace.DecRef(ctx) if cfg.MountNamespaceVFS2 != nil { cfg.MountNamespaceVFS2.DecRef(ctx) } return nil, err } return t, nil } // newTask is a helper for TaskSet.NewTask that only takes ownership of parts // of cfg if it succeeds. func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { tg := cfg.ThreadGroup image := cfg.TaskImage t := &Task{ taskNode: taskNode{ tg: tg, parent: cfg.Parent, children: make(map[*Task]struct{}), }, runState: (*runApp)(nil), interruptChan: make(chan struct{}, 1), signalMask: cfg.SignalMask, signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, image: *image, fsContext: cfg.FSContext, fdTable: cfg.FDTable, p: cfg.Kernel.Platform.NewContext(), k: cfg.Kernel, ptraceTracees: make(map[*Task]struct{}), allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, niceness: cfg.Niceness, netns: cfg.NetworkNamespace, utsns: cfg.UTSNamespace, ipcns: cfg.IPCNamespace, abstractSockets: cfg.AbstractSocketNamespace, mountNamespaceVFS2: cfg.MountNamespaceVFS2, rseqCPU: -1, rseqAddr: cfg.RSeqAddr, rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, cgroups: make(map[Cgroup]struct{}), } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) // We don't construct t.blockingTimer until Task.run(); see that function // for justification. // Make the new task (and possibly thread group) visible to the rest of // the system atomically. ts.mu.Lock() defer ts.mu.Unlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() if tg.exiting || tg.execing != nil { // If the caller is in the same thread group, then what we return // doesn't matter too much since the caller will exit before it returns // to userspace. If the caller isn't in the same thread group, then // we're in uncharted territory and can return whatever we want. return nil, linuxerr.EINTR } if err := ts.assignTIDsLocked(t); err != nil { return nil, err } // Below this point, newTask is expected not to fail (there is no rollback // of assignTIDsLocked or any of the following). // Logging on t's behalf will panic if t.logPrefix hasn't been // initialized. This is the earliest point at which we can do so // (since t now has thread IDs). t.updateInfoLocked() if cfg.InheritParent != nil { t.parent = cfg.InheritParent.parent } if t.parent != nil { t.parent.children[t] = struct{}{} } if VFS2Enabled { t.EnterInitialCgroups(t.parent) } if tg.leader == nil { // New thread group. tg.leader = t if parentPG := tg.parentPG(); parentPG == nil { tg.createSession() } else { // Inherit the process group and terminal. parentPG.incRefWithParent(parentPG) tg.processGroup = parentPG tg.tty = t.parent.tg.tty } } tg.tasks.PushBack(t) tg.tasksCount++ tg.liveTasks++ tg.activeTasks++ // Propagate external TaskSet stops to the new task. t.stopCount = ts.stopCount t.mu.Lock() defer t.mu.Unlock() t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) t.startTime = t.k.RealtimeClock().Now() return t, nil } // assignTIDsLocked ensures that new task t is visible in all PID namespaces in // which it should be visible. // // Preconditions: ts.mu must be locked for writing. func (ts *TaskSet) assignTIDsLocked(t *Task) error { type allocatedTID struct { ns *PIDNamespace tid ThreadID } var allocatedTIDs []allocatedTID for ns := t.tg.pidns; ns != nil; ns = ns.parent { tid, err := ns.allocateTID() if err != nil { // Failure. Remove the tids we already allocated in descendant // namespaces. for _, a := range allocatedTIDs { delete(a.ns.tasks, a.tid) delete(a.ns.tids, t) if t.tg.leader == nil { delete(a.ns.tgids, t.tg) } } return err } ns.tasks[tid] = t ns.tids[t] = tid if t.tg.leader == nil { // New thread group. ns.tgids[t.tg] = tid } allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) } return nil } // allocateTID returns an unused ThreadID from ns. // // Preconditions: ns.owner.mu must be locked for writing. func (ns *PIDNamespace) allocateTID() (ThreadID, error) { if ns.exiting { // "In this case, a subsequent fork(2) into this PID namespace will // fail with the error ENOMEM; it is not possible to create a new // processes [sic] in a PID namespace whose init process has // terminated." - pid_namespaces(7) return 0, linuxerr.ENOMEM } tid := ns.last for { // Next. tid++ if tid > TasksLimit { tid = InitTID + 1 } // Is it available? tidInUse := func() bool { if _, ok := ns.tasks[tid]; ok { return true } if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { return true } if _, ok := ns.sessions[SessionID(tid)]; ok { return true } return false }() if !tidInUse { ns.last = tid return tid, nil } // Did we do a full cycle? if tid == ns.last { // No tid available. return 0, linuxerr.EAGAIN } } } // Start starts the task goroutine. Start must be called exactly once for each // task returned by NewTask. // // 'tid' must be the task's TID in the root PID namespace and it's used for // debugging purposes only (set as parameter to Task.run to make it visible // in stack dumps). func (t *Task) Start(tid ThreadID) { // If the task was restored, it may be "starting" after having already exited. if t.runState == nil { return } t.goroutineStopped.Add(1) t.tg.liveGoroutines.Add(1) t.tg.pidns.owner.liveGoroutines.Add(1) t.tg.pidns.owner.runningGoroutines.Add(1) // Task is now running in system mode. t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) // Use the task's TID in the root PID namespace to make it visible in stack dumps. go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops }