// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"math"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/signalfd"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// "For a process to have permission to send a signal it must
// - either be privileged (CAP_KILL), or
// - the real or effective user ID of the sending process must be equal to the
// real or saved set-user-ID of the target process.
//
// In the case of SIGCONT it suffices when the sending and receiving processes
// belong to the same session." - kill(2)
//
// Equivalent to kernel/signal.c:check_kill_permission.
func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool {
	// kernel/signal.c:check_kill_permission also allows a signal if the
	// sending and receiving tasks share a thread group, which is not
	// mentioned in kill(2) since kill does not allow task-level
	// granularity in signal sending.
	if t.ThreadGroup() == target.ThreadGroup() {
		return true
	}

	if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) {
		return true
	}

	creds := t.Credentials()
	tcreds := target.Credentials()
	if creds.EffectiveKUID == tcreds.SavedKUID || creds.EffectiveKUID == tcreds.RealKUID ||
		creds.RealKUID == tcreds.SavedKUID || creds.RealKUID == tcreds.RealKUID {
		return true
	}

	if sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session() {
		return true
	}
	return false
}
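
// killUIDRuleSketch is an illustrative sketch added for exposition only: it
// is not called by anything in this file, and its name and parameters are
// assumptions rather than part of the original code. It restates the kill(2)
// UID rule that mayKill evaluates on KUIDs, using plain integers so the four
// permitted sender/target combinations are easy to see in isolation.
func killUIDRuleSketch(senderRUID, senderEUID, targetRUID, targetSUID uint32) bool {
	// "the real or effective user ID of the sending process must be equal
	// to the real or saved set-user-ID of the target process." - kill(2)
	return senderEUID == targetSUID || senderEUID == targetRUID ||
		senderRUID == targetSUID || senderRUID == targetRUID
}
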
// Kill implements linux syscall kill(2).
func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := kernel.ThreadID(args[0].Int())
	sig := linux.Signal(args[1].Int())

	switch {
	case pid > 0:
		// "If pid is positive, then signal sig is sent to the process with
		// the ID specified by pid." - kill(2)
		//
		// This loops to handle races with execve where target dies between
		// TaskWithID and SendGroupSignal. Compare Linux's
		// kernel/signal.c:kill_pid_info().
		for {
			target := t.PIDNamespace().TaskWithID(pid)
			if target == nil {
				return 0, nil, syserror.ESRCH
			}
			if !mayKill(t, target, sig) {
				return 0, nil, syserror.EPERM
			}
			info := &arch.SignalInfo{
				Signo: int32(sig),
				Code:  arch.SignalInfoUser,
			}
			info.SetPid(int32(target.PIDNamespace().IDOfTask(t)))
			info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow()))
			if err := target.SendGroupSignal(info); err != syserror.ESRCH {
				return 0, nil, err
			}
		}
	case pid == -1:
		// "If pid equals -1, then sig is sent to every process for which
		// the calling process has permission to send signals, except for
		// process 1 (init), but see below. ... POSIX.1-2001 requires that
		// kill(-1,sig) send sig to all processes that the calling process
		// may send signals to, except possibly for some
		// implementation-defined system processes. Linux allows a process
		// to signal itself, but on Linux the call kill(-1,sig) does not
		// signal the calling process."
		var (
			lastErr   error
			delivered int
		)
		for _, tg := range t.PIDNamespace().ThreadGroups() {
			if tg == t.ThreadGroup() {
				continue
			}
			if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID {
				continue
			}

			// If pid == -1, the returned error is the last non-EPERM error
			// from any call to group_send_sig_info.
			if !mayKill(t, tg.Leader(), sig) {
				continue
			}
			// Here and below, whether or not kill returns an error may
			// depend on the iteration order. We at least implement the
			// semantics documented by the man page: "On success (at least
			// one signal was sent), zero is returned."
			info := &arch.SignalInfo{
				Signo: int32(sig),
				Code:  arch.SignalInfoUser,
			}
			info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
			info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
			err := tg.SendSignal(info)
			if err == syserror.ESRCH {
				// ESRCH is ignored because it means the task
				// exited while we were iterating. This is a
				// race which would not normally exist on
				// Linux, so we suppress it.
				continue
			}
			delivered++
			if err != nil {
				lastErr = err
			}
		}
		if delivered > 0 {
			return 0, nil, lastErr
		}
		return 0, nil, syserror.ESRCH
	default:
		// "If pid equals 0, then sig is sent to every process in the
		// process group of the calling process."
		//
		// "If pid is less than -1, then sig is sent to every process in
		// the process group whose ID is -pid."
		pgid := kernel.ProcessGroupID(-pid)
		if pgid == 0 {
			pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
		}

		// If pid != -1 (i.e. signalling a process group), the returned
		// error is the last error from any call to group_send_sig_info.
		lastErr := syserror.ESRCH
		for _, tg := range t.PIDNamespace().ThreadGroups() {
			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
				if !mayKill(t, tg.Leader(), sig) {
					lastErr = syserror.EPERM
					continue
				}
				info := &arch.SignalInfo{
					Signo: int32(sig),
					Code:  arch.SignalInfoUser,
				}
				info.SetPid(int32(tg.PIDNamespace().IDOfTask(t)))
				info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow()))
				// See the note above regarding the ESRCH race.
				if err := tg.SendSignal(info); err != syserror.ESRCH {
					lastErr = err
				}
			}
		}
		return 0, nil, lastErr
	}
}

func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo {
	info := &arch.SignalInfo{
		Signo: int32(sig),
		Code:  arch.SignalInfoTkill,
	}
	info.SetPid(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup())))
	info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
	return info
}

// Tkill implements linux syscall tkill(2).
func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := kernel.ThreadID(args[0].Int())
	sig := linux.Signal(args[1].Int())

	// N.B. Inconsistent with the man page, Linux actually rejects calls
	// with tid <= 0 with EINVAL. This isn't the same for all signal calls.
	if tid <= 0 {
		return 0, nil, syserror.EINVAL
	}

	target := t.PIDNamespace().TaskWithID(tid)
	if target == nil {
		return 0, nil, syserror.ESRCH
	}

	if !mayKill(t, target, sig) {
		return 0, nil, syserror.EPERM
	}
	return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig))
}
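
// killTargetSketch is an illustrative helper added for exposition; it is not
// used by any syscall above, and both its name and its return strings are
// assumptions. It spells out how the four pid encodings accepted by kill(2)
// are dispatched by the switch in Kill.
func killTargetSketch(pid kernel.ThreadID) string {
	switch {
	case pid > 0:
		return "the thread group whose ID is pid"
	case pid == 0:
		return "every thread group in the caller's process group"
	case pid == -1:
		return "every thread group the caller may signal, except init and the caller"
	default: // pid < -1
		return "every thread group in the process group whose ID is -pid"
	}
}
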
// Tgkill implements linux syscall tgkill(2).
func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tgid := kernel.ThreadID(args[0].Int())
	tid := kernel.ThreadID(args[1].Int())
	sig := linux.Signal(args[2].Int())

	// N.B. Inconsistent with the man page, Linux actually rejects calls
	// with tgid/tid <= 0 with EINVAL. This isn't the same for all signal
	// calls.
	if tgid <= 0 || tid <= 0 {
		return 0, nil, syserror.EINVAL
	}

	targetTG := t.PIDNamespace().ThreadGroupWithID(tgid)
	target := t.PIDNamespace().TaskWithID(tid)
	if targetTG == nil || target == nil || target.ThreadGroup() != targetTG {
		return 0, nil, syserror.ESRCH
	}

	if !mayKill(t, target, sig) {
		return 0, nil, syserror.EPERM
	}
	return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig))
}

// RtSigaction implements linux syscall rt_sigaction(2).
func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	sig := linux.Signal(args[0].Int())
	newactarg := args[1].Pointer()
	oldactarg := args[2].Pointer()
	sigsetsize := args[3].SizeT()

	if sigsetsize != linux.SignalSetSize {
		return 0, nil, syserror.EINVAL
	}

	var newactptr *arch.SignalAct
	if newactarg != 0 {
		newact, err := t.CopyInSignalAct(newactarg)
		if err != nil {
			return 0, nil, err
		}
		newactptr = &newact
	}
	oldact, err := t.ThreadGroup().SetSignalAct(sig, newactptr)
	if err != nil {
		return 0, nil, err
	}
	if oldactarg != 0 {
		if err := t.CopyOutSignalAct(oldactarg, &oldact); err != nil {
			return 0, nil, err
		}
	}
	return 0, nil, nil
}

// Sigreturn implements linux syscall sigreturn(2).
func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	ctrl, err := t.SignalReturn(false)
	return 0, ctrl, err
}

// RtSigreturn implements linux syscall rt_sigreturn(2).
func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	ctrl, err := t.SignalReturn(true)
	return 0, ctrl, err
}

// RtSigprocmask implements linux syscall rt_sigprocmask(2).
func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	how := args[0].Int()
	setaddr := args[1].Pointer()
	oldaddr := args[2].Pointer()
	sigsetsize := args[3].SizeT()

	if sigsetsize != linux.SignalSetSize {
		return 0, nil, syserror.EINVAL
	}
	oldmask := t.SignalMask()
	if setaddr != 0 {
		mask, err := CopyInSigSet(t, setaddr, sigsetsize)
		if err != nil {
			return 0, nil, err
		}

		switch how {
		case linux.SIG_BLOCK:
			t.SetSignalMask(oldmask | mask)
		case linux.SIG_UNBLOCK:
			t.SetSignalMask(oldmask &^ mask)
		case linux.SIG_SETMASK:
			t.SetSignalMask(mask)
		default:
			return 0, nil, syserror.EINVAL
		}
	}
	if oldaddr != 0 {
		return 0, nil, copyOutSigSet(t, oldaddr, oldmask)
	}

	return 0, nil, nil
}

// Sigaltstack implements linux syscall sigaltstack(2).
func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	setaddr := args[0].Pointer()
	oldaddr := args[1].Pointer()

	alt := t.SignalStack()
	if oldaddr != 0 {
		if err := t.CopyOutSignalStack(oldaddr, &alt); err != nil {
			return 0, nil, err
		}
	}
	if setaddr != 0 {
		alt, err := t.CopyInSignalStack(setaddr)
		if err != nil {
			return 0, nil, err
		}
		// The signal stack cannot be changed if the task is currently
		// on the stack. This is enforced at the lowest level because
		// these semantics apply to changing the signal stack via a
		// ucontext during a signal handler.
		if !t.SetSignalStack(alt) {
			return 0, nil, syserror.EPERM
		}
	}

	return 0, nil, nil
}

// Pause implements linux syscall pause(2).
func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
}
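
// applySigmaskHowSketch is an illustrative restatement, added for exposition,
// of how the three rt_sigprocmask(2) "how" values combine the old and new
// masks in RtSigprocmask above. It is not used by the syscalls in this file,
// and its name is an assumption.
func applySigmaskHowSketch(how int32, oldMask, newMask linux.SignalSet) (linux.SignalSet, error) {
	switch how {
	case linux.SIG_BLOCK:
		// Add the new signals to the set of blocked signals.
		return oldMask | newMask, nil
	case linux.SIG_UNBLOCK:
		// Remove the new signals from the set of blocked signals.
		return oldMask &^ newMask, nil
	case linux.SIG_SETMASK:
		// Replace the blocked set entirely.
		return newMask, nil
	default:
		return oldMask, syserror.EINVAL
	}
}
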
// RtSigpending implements linux syscall rt_sigpending(2).
func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	pending := t.PendingSignals()
	_, err := pending.CopyOut(t, addr)
	return 0, nil, err
}

// RtSigtimedwait implements linux syscall rt_sigtimedwait(2).
func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	sigset := args[0].Pointer()
	siginfo := args[1].Pointer()
	timespec := args[2].Pointer()
	sigsetsize := args[3].SizeT()

	mask, err := CopyInSigSet(t, sigset, sigsetsize)
	if err != nil {
		return 0, nil, err
	}

	var timeout time.Duration
	if timespec != 0 {
		d, err := copyTimespecIn(t, timespec)
		if err != nil {
			return 0, nil, err
		}
		if !d.Valid() {
			return 0, nil, syserror.EINVAL
		}
		timeout = time.Duration(d.ToNsecCapped())
	} else {
		timeout = time.Duration(math.MaxInt64)
	}

	si, err := t.Sigtimedwait(mask, timeout)
	if err != nil {
		return 0, nil, err
	}

	if siginfo != 0 {
		si.FixSignalCodeForUser()
		if _, err := si.CopyOut(t, siginfo); err != nil {
			return 0, nil, err
		}
	}
	return uintptr(si.Signo), nil, nil
}

// RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2).
func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := kernel.ThreadID(args[0].Int())
	sig := linux.Signal(args[1].Int())
	infoAddr := args[2].Pointer()

	// Copy in the info.
	//
	// We must ensure that the Signo is set (Linux overrides this in the
	// same way), and that the code is in the allowed set. This same logic
	// appears below in RtTgsigqueueinfo and should be kept in sync.
	var info arch.SignalInfo
	if _, err := info.CopyIn(t, infoAddr); err != nil {
		return 0, nil, err
	}
	info.Signo = int32(sig)

	// This must loop to handle the race with execve described in Kill.
	for {
		// Deliver to the given task's thread group.
		target := t.PIDNamespace().TaskWithID(pid)
		if target == nil {
			return 0, nil, syserror.ESRCH
		}

		// If the sender is not the receiver, it can't use si_codes used by
		// the kernel or SI_TKILL.
		if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t {
			return 0, nil, syserror.EPERM
		}

		if !mayKill(t, target, sig) {
			return 0, nil, syserror.EPERM
		}
		if err := target.SendGroupSignal(&info); err != syserror.ESRCH {
			return 0, nil, err
		}
	}
}

// RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2).
func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tgid := kernel.ThreadID(args[0].Int())
	tid := kernel.ThreadID(args[1].Int())
	sig := linux.Signal(args[2].Int())
	infoAddr := args[3].Pointer()

	// N.B. Inconsistent with the man page, Linux actually rejects calls
	// with tgid/tid <= 0 with EINVAL. This isn't the same for all signal
	// calls.
	if tgid <= 0 || tid <= 0 {
		return 0, nil, syserror.EINVAL
	}

	// Copy in the info. See RtSigqueueinfo above.
	var info arch.SignalInfo
	if _, err := info.CopyIn(t, infoAddr); err != nil {
		return 0, nil, err
	}
	info.Signo = int32(sig)

	// Deliver to the given task.
	targetTG := t.PIDNamespace().ThreadGroupWithID(tgid)
	target := t.PIDNamespace().TaskWithID(tid)
	if targetTG == nil || target == nil || target.ThreadGroup() != targetTG {
		return 0, nil, syserror.ESRCH
	}

	// If the sender is not the receiver, it can't use si_codes used by the
	// kernel or SI_TKILL.
	if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t {
		return 0, nil, syserror.EPERM
	}

	if !mayKill(t, target, sig) {
		return 0, nil, syserror.EPERM
	}
	return 0, nil, target.SendSignal(&info)
}
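
// siCodeForgedByUserSketch is an illustrative predicate added for exposition;
// it is not called above and its name is an assumption. It captures the
// si_code restriction shared by RtSigqueueinfo and RtTgsigqueueinfo: a task
// may not queue a siginfo to another task if the code claims a kernel origin
// (>= 0) or SI_TKILL.
func siCodeForgedByUserSketch(code int32) bool {
	return code >= 0 || code == arch.SignalInfoTkill
}
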
// RtSigsuspend implements linux syscall rt_sigsuspend(2).
func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	sigset := args[0].Pointer()

	// Copy in the signal mask.
	var mask linux.SignalSet
	if _, err := mask.CopyIn(t, sigset); err != nil {
		return 0, nil, err
	}
	mask &^= kernel.UnblockableSignals

	// Swap the mask.
	oldmask := t.SignalMask()
	t.SetSignalMask(mask)
	t.SetSavedSignalMask(oldmask)

	// Perform the wait.
	return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
}

// RestartSyscall implements the linux syscall restart_syscall(2).
func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	if r := t.SyscallRestartBlock(); r != nil {
		n, err := r.Restart(t)
		return n, nil, err
	}
	// The restart block should never be nil here, but it's possible
	// ERESTART_RESTARTBLOCK was set by ptrace without the current syscall
	// setting up a restart block. If ptrace didn't manipulate the return
	// value, finding a nil restart block is a bug. Linux ensures that the
	// restart function is never null by (re)initializing it with one that
	// translates the restart into EINTR. We'll emulate that behaviour.
	t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?")
	return 0, nil, syserror.EINTR
}

// sharedSignalfd is shared between signalfd(2) and signalfd4(2).
func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
	// Copy in the signal mask.
	mask, err := CopyInSigSet(t, sigset, sigsetsize)
	if err != nil {
		return 0, nil, err
	}

	// Always check for valid flags, even if not creating.
	if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
		return 0, nil, syserror.EINVAL
	}

	// Is this a change to an existing signalfd?
	//
	// The spec indicates that this should adjust the mask.
	if fd != -1 {
		file := t.GetFile(fd)
		if file == nil {
			return 0, nil, syserror.EBADF
		}
		defer file.DecRef()

		// Is this a signalfd?
		if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok {
			s.SetMask(mask)
			return 0, nil, nil
		}

		// Not a signalfd.
		return 0, nil, syserror.EINVAL
	}

	// Create a new file.
	file, err := signalfd.New(t, mask)
	if err != nil {
		return 0, nil, err
	}
	defer file.DecRef()

	// Set appropriate flags.
	file.SetFlags(fs.SettableFileFlags{
		NonBlocking: flags&linux.SFD_NONBLOCK != 0,
	})

	// Create a new descriptor.
	fd, err = t.NewFDFrom(0, file, kernel.FDFlags{
		CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
	})
	if err != nil {
		return 0, nil, err
	}

	// Done.
	return uintptr(fd), nil, nil
}

// Signalfd implements the linux syscall signalfd(2).
func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	sigset := args[1].Pointer()
	sigsetsize := args[2].SizeT()
	return sharedSignalfd(t, fd, sigset, sigsetsize, 0)
}

// Signalfd4 implements the linux syscall signalfd4(2).
func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	sigset := args[1].Pointer()
	sigsetsize := args[2].SizeT()
	flags := args[3].Int()
	return sharedSignalfd(t, fd, sigset, sigsetsize, flags)
}
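
// signalfdFlagsSketch is an illustrative helper added for exposition; it is
// not used by sharedSignalfd above and its name is an assumption. It shows
// how the two flags accepted by signalfd4(2) map onto sentry state:
// SFD_NONBLOCK becomes a file flag, while SFD_CLOEXEC becomes a descriptor
// flag.
func signalfdFlagsSketch(flags int32) (fs.SettableFileFlags, kernel.FDFlags) {
	fileFlags := fs.SettableFileFlags{NonBlocking: flags&linux.SFD_NONBLOCK != 0}
	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.SFD_CLOEXEC != 0}
	return fileFlags, fdFlags
}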