diff options
Diffstat (limited to 'pkg/sentry/kernel/task_clone.go')
-rw-r--r-- | pkg/sentry/kernel/task_clone.go | 475 |
1 files changed, 475 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go new file mode 100644 index 000000000..3a74abdfb --- /dev/null +++ b/pkg/sentry/kernel/task_clone.go @@ -0,0 +1,475 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SharingOptions controls what resources are shared by a new task created by +// Task.Clone, or an existing task affected by Task.Unshare. +type SharingOptions struct { + // If NewAddressSpace is true, the task should have an independent virtual + // address space. + NewAddressSpace bool + + // If NewSignalHandlers is true, the task should use an independent set of + // signal handlers. + NewSignalHandlers bool + + // If NewThreadGroup is true, the task should be the leader of its own + // thread group. TerminationSignal is the signal that the thread group + // will send to its parent when it exits. If NewThreadGroup is false, + // TerminationSignal is ignored. + NewThreadGroup bool + TerminationSignal linux.Signal + + // If NewPIDNamespace is true: + // + // - In the context of Task.Clone, the new task should be the init task + // (TID 1) in a new PID namespace. + // + // - In the context of Task.Unshare, the task should create a new PID + // namespace, and all subsequent clones of the task should be members of + // the new PID namespace. + NewPIDNamespace bool + + // If NewUserNamespace is true, the task should have an independent user + // namespace. + NewUserNamespace bool + + // If NewNetworkNamespace is true, the task should have an independent + // network namespace. (Note that network namespaces are not really + // implemented; see comment on Task.netns for details.) + NewNetworkNamespace bool + + // If NewFiles is true, the task should use an independent file descriptor + // table. + NewFiles bool + + // If NewFSContext is true, the task should have an independent FSContext. + NewFSContext bool + + // If NewUTSNamespace is true, the task should have an independent UTS + // namespace. + NewUTSNamespace bool + + // If NewIPCNamespace is true, the task should have an independent IPC + // namespace. + NewIPCNamespace bool +} + +// CloneOptions controls the behavior of Task.Clone. +type CloneOptions struct { + // SharingOptions defines the set of resources that the new task will share + // with its parent. + SharingOptions + + // Stack is the initial stack pointer of the new task. If Stack is 0, the + // new task will start with the same stack pointer as its parent. + Stack usermem.Addr + + // If SetTLS is true, set the new task's TLS (thread-local storage) + // descriptor to TLS. If SetTLS is false, TLS is ignored. + SetTLS bool + TLS usermem.Addr + + // If ChildClearTID is true, when the child exits, 0 is written to the + // address ChildTID in the child's memory, and if the write is successful a + // futex wake on the same address is performed. + // + // If ChildSetTID is true, the child's thread ID (in the child's PID + // namespace) is written to address ChildTID in the child's memory. (As in + // Linux, failed writes are silently ignored.) + ChildClearTID bool + ChildSetTID bool + ChildTID usermem.Addr + + // If ParentSetTID is true, the child's thread ID (in the parent's PID + // namespace) is written to address ParentTID in the parent's memory. (As + // in Linux, failed writes are silently ignored.) + // + // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID + // causes the child's thread ID to be written to ptid in both the parent + // and child's memory, but this is a documentation error fixed by + // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). + ParentSetTID bool + ParentTID usermem.Addr + + // If Vfork is true, place the parent in vforkStop until the cloned task + // releases its TaskContext. + Vfork bool + + // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for + // this clone(), and do not ptrace-attach the caller's tracer to the new + // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). + Untraced bool + + // If InheritTracer is true, ptrace-attach the caller's tracer to the new + // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported + // for it. If both Untraced and InheritTracer are true, no event will be + // reported, but tracer inheritance will still occur. + InheritTracer bool +} + +// Clone implements the clone(2) syscall and returns the thread ID of the new +// task in t's PID namespace. Clone may return both a non-zero thread ID and a +// non-nil error. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { + // Since signal actions may refer to application signal handlers by virtual + // address, any set of signal handlers must refer to the same address + // space. + if !opts.NewSignalHandlers && opts.NewAddressSpace { + return 0, nil, syserror.EINVAL + } + // In order for the behavior of thread-group-directed signals to be sane, + // all tasks in a thread group must share signal handlers. + if !opts.NewThreadGroup && opts.NewSignalHandlers { + return 0, nil, syserror.EINVAL + } + // All tasks in a thread group must be in the same PID namespace. + if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + return 0, nil, syserror.EINVAL + } + // The two different ways of specifying a new PID namespace are + // incompatible. + if opts.NewPIDNamespace && t.childPIDNamespace != nil { + return 0, nil, syserror.EINVAL + } + // Thread groups and FS contexts cannot span user namespaces. + if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + return 0, nil, syserror.EINVAL + } + + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to + // be created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call." - + // user_namespaces(7) + creds := t.Credentials() + var userns *auth.UserNamespace + if opts.NewUserNamespace { + var err error + // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and + // the caller is in a chroot environment (i.e., the caller's root + // directory does not match the root directory of the mount namespace + // in which it resides)." - clone(2). Neither chroot(2) nor + // user_namespaces(7) document this. + if t.IsChrooted() { + return 0, nil, syserror.EPERM + } + userns, err = creds.NewChildUserNamespace() + if err != nil { + return 0, nil, err + } + } + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapability(linux.CAP_SYS_ADMIN) { + return 0, nil, syserror.EPERM + } + + utsns := t.UTSNamespace() + if opts.NewUTSNamespace { + // Note that this must happen after NewUserNamespace so we get + // the new userns if there is one. + utsns = t.UTSNamespace().Clone(userns) + } + + ipcns := t.IPCNamespace() + if opts.NewIPCNamespace { + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + ipcns = NewIPCNamespace() + } + + tc, err := t.tc.Fork(t, !opts.NewAddressSpace) + if err != nil { + return 0, nil, err + } + // clone() returns 0 in the child. + tc.Arch.SetReturn(0) + if opts.Stack != 0 { + tc.Arch.SetStack(uintptr(opts.Stack)) + } + if opts.SetTLS { + tc.Arch.StateData().Regs.Fs_base = uint64(opts.TLS) + } + + pidns := t.tg.pidns + if t.childPIDNamespace != nil { + pidns = t.childPIDNamespace + } else if opts.NewPIDNamespace { + pidns = pidns.NewChild(userns) + } + tg := t.tg + parent := t.parent + if opts.NewThreadGroup { + sh := t.tg.signalHandlers + if opts.NewSignalHandlers { + sh = sh.Fork() + } + tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + parent = t + } + cfg := &TaskConfig{ + Kernel: t.k, + Parent: parent, + ThreadGroup: tg, + TaskContext: tc, + TaskResources: t.tr.Fork(!opts.NewFiles, !opts.NewFSContext), + Niceness: t.Niceness(), + Credentials: creds.Fork(), + NetworkNamespaced: t.netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + } + if opts.NewNetworkNamespace { + cfg.NetworkNamespaced = true + } + nt, err := t.tg.pidns.owner.NewTask(cfg) + if err != nil { + if opts.NewThreadGroup { + tg.release() + } + return 0, nil, err + } + + // "A child process created via fork(2) inherits a copy of its parent's + // alternate signal stack settings" - sigaltstack(2). + // + // However kernel/fork.c:copy_process() adds a limitation to this: + // "sigaltstack should be cleared when sharing the same VM". + if opts.NewAddressSpace || opts.Vfork { + nt.SetSignalStack(t.SignalStack()) + } + + if userns != nil { + if err := nt.SetUserNamespace(userns); err != nil { + // This shouldn't be possible: userns was created from nt.creds, so + // nt should have CAP_SYS_ADMIN in userns. + panic("Task.Clone: SetUserNamespace failed: " + err.Error()) + } + } + + // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to + // nt that it must receive before its task goroutine starts running. + tid := nt.k.tasks.Root.IDOfTask(nt) + defer nt.Start(tid) + + // "If fork/clone and execve are allowed by @prog, any child processes will + // be constrained to the same filters and system call ABI as the parent." - + // Documentation/prctl/seccomp_filter.txt + nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...) + if opts.Vfork { + nt.vforkParent = t + } + + if opts.ChildClearTID { + nt.SetClearTID(opts.ChildTID) + } + if opts.ChildSetTID { + // Can't use Task.CopyOut, which assumes AddressSpaceActive. + usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + } + ntid := t.tg.pidns.IDOfTask(nt) + if opts.ParentSetTID { + t.CopyOut(opts.ParentTID, ntid) + } + + kind := ptraceCloneKindClone + if opts.Vfork { + kind = ptraceCloneKindVfork + } else if opts.TerminationSignal == linux.SIGCHLD { + kind = ptraceCloneKindFork + } + if t.ptraceClone(kind, nt, opts) { + if opts.Vfork { + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil + } + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil + } + if opts.Vfork { + t.maybeBeginVforkStop(nt) + return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil + } + return ntid, nil, nil +} + +// maybeBeginVforkStop checks if a previously-started vfork child is still +// running and has not yet released its MM, such that its parent t should enter +// a vforkStop. +// +// Preconditions: The caller must be running on t's task goroutine. +func (t *Task) maybeBeginVforkStop(child *Task) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.killedLocked() { + child.vforkParent = nil + return + } + if child.vforkParent == t { + t.beginInternalStopLocked((*vforkStop)(nil)) + } +} + +func (t *Task) unstopVforkParent() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if p := t.vforkParent; p != nil { + p.tg.signalHandlers.mu.Lock() + defer p.tg.signalHandlers.mu.Unlock() + if _, ok := p.stop.(*vforkStop); ok { + p.endInternalStopLocked() + } + // Parent no longer needs to be unstopped. + t.vforkParent = nil + } +} + +type runSyscallAfterPtraceEventClone struct { + vforkChild *Task + + // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's + // PID namespace. vforkChildTID must be stored since the child may exit and + // release its TID before the PTRACE_EVENT stop ends. + vforkChildTID ThreadID +} + +func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { + if r.vforkChild != nil { + t.maybeBeginVforkStop(r.vforkChild) + return &runSyscallAfterVforkStop{r.vforkChildTID} + } + return (*runSyscallExit)(nil) +} + +type runSyscallAfterVforkStop struct { + // childTID has the same meaning as + // runSyscallAfterPtraceEventClone.vforkChildTID. + childTID ThreadID +} + +func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { + t.ptraceVforkDone(r.childTID) + return (*runSyscallExit)(nil) +} + +// Unshare changes the set of resources t shares with other tasks, as specified +// by opts. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Unshare(opts *SharingOptions) error { + // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and + // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if + // t is the only task using its MM, which due to clone(2)'s rules imply + // that it is also the only task using its signal handlers / in its thread + // group, and cause EINVAL to be returned otherwise. + // + // Since we don't count the number of tasks using each address space or set + // of signal handlers, we reject NewSignalHandlers and NewAddressSpace + // altogether, and interpret NewThreadGroup as requiring that t be the only + // member of its thread group. This seems to be logically coherent, in the + // sense that clone(2) allows a task to share signal handlers and address + // spaces with tasks in other thread groups. + if opts.NewAddressSpace || opts.NewSignalHandlers { + return syserror.EINVAL + } + if opts.NewThreadGroup { + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return syserror.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + // This isn't racy because we're the only living task, and therefore + // the only task capable of creating new ones, in our thread group. + } + if opts.NewUserNamespace { + if t.IsChrooted() { + return syserror.EPERM + } + // This temporary is needed because Go. + creds := t.Credentials() + newUserNS, err := creds.NewChildUserNamespace() + if err != nil { + return err + } + err = t.SetUserNamespace(newUserNS) + if err != nil { + return err + } + } + haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) + if opts.NewPIDNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) + } + t.mu.Lock() + defer t.mu.Unlock() + if opts.NewNetworkNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.netns = true + } + if opts.NewUTSNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that this must happen after NewUserNamespace, so the + // new user namespace is used if there is one. + t.utsns = t.utsns.Clone(t.creds.UserNamespace) + } + if opts.NewIPCNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + t.ipcns = NewIPCNamespace() + } + if opts.NewFiles { + oldFDMap := t.tr.FDMap + t.tr.FDMap = oldFDMap.Fork() + oldFDMap.DecRef() + } + if opts.NewFSContext { + oldFS := t.tr.FSContext + t.tr.FSContext = oldFS.Fork() + oldFS.DecRef() + } + return nil +} + +// vforkStop is a TaskStop imposed on a task that creates a child with +// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its +// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so +// that the child and parent share mappings until the child execve()s into a +// new process image or exits.) +type vforkStop struct{} + +// StopIgnoresKill implements TaskStop.Killable. +func (*vforkStop) Killable() bool { return true } |