diff options
-rw-r--r-- | pkg/sentry/kernel/task_exit.go | 164 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_thread.go | 10 | ||||
-rw-r--r-- | test/syscalls/linux/BUILD | 2 | ||||
-rw-r--r-- | test/syscalls/linux/wait.cc | 185 |
4 files changed, 236 insertions, 125 deletions
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 6e9701b01..2e1e46582 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -782,6 +782,10 @@ type WaitOptions struct { // for. CloneTasks bool + // If SiblingChildren is true, events from children tasks of any task + // in the thread group of the waiter are eligible to be waited for. + SiblingChildren bool + // Events is a bitwise combination of the events defined above that specify // what events are of interest to the call to Wait. Events waiter.EventMask @@ -869,87 +873,109 @@ func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() - // Without the (unimplemented) __WNOTHREAD flag, a task can wait on the - // children and tracees of any task in the same thread group. - for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { - for child := range parent.children { - if !opts.matchesTask(child, parent.tg.pidns) { - continue - } - // Non-leaders don't notify parents on exit and aren't eligible to - // be waited on. - if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { - anyWaitableTasks = true - if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { - return wr, nil - } - } - // Check for group stops and continues. Tasks that have passed - // TaskExitInitiated can no longer participate in group stops. - if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { - continue - } - if child.exitState >= TaskExitInitiated { - continue - } - // If the waiter is in the same thread group as the task's - // tracer, do not report its group stops; they will be reported - // as ptrace stops instead. This also skips checking for group - // continues, but they'll be checked for when scanning tracees - // below. (Per kernel/exit.c:wait_consider_task(): "If a - // ptracer wants to distinguish the two events for its own - // children, it should create a separate process which takes - // the role of real parent.") - if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { - continue + if opts.SiblingChildren { + // We can wait on the children and tracees of any task in the + // same thread group. + for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { + wr, any := t.waitParentLocked(opts, parent) + if wr != nil { + return wr, nil } + anyWaitableTasks = anyWaitableTasks || any + } + } else { + // We can only wait on this task. + var wr *WaitResult + wr, anyWaitableTasks = t.waitParentLocked(opts, t) + if wr != nil { + return wr, nil + } + } + + if anyWaitableTasks { + return nil, ErrNoWaitableEvent + } + return nil, syserror.ECHILD +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) { + anyWaitableTasks := false + + for child := range parent.children { + if !opts.matchesTask(child, parent.tg.pidns) { + continue + } + // Non-leaders don't notify parents on exit and aren't eligible to + // be waited on. + if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { anyWaitableTasks = true - if opts.Events&EventChildGroupStop != 0 { - if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { - return wr, nil - } - } - if opts.Events&EventGroupContinue != 0 { - if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { - return wr, nil - } + if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { + return wr, anyWaitableTasks } } - for tracee := range parent.ptraceTracees { - if !opts.matchesTask(tracee, parent.tg.pidns) { - continue - } - // Non-leaders do notify tracers on exit. - if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { - anyWaitableTasks = true - if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { - return wr, nil - } - } - if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { - continue + // Check for group stops and continues. Tasks that have passed + // TaskExitInitiated can no longer participate in group stops. + if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { + continue + } + if child.exitState >= TaskExitInitiated { + continue + } + // If the waiter is in the same thread group as the task's + // tracer, do not report its group stops; they will be reported + // as ptrace stops instead. This also skips checking for group + // continues, but they'll be checked for when scanning tracees + // below. (Per kernel/exit.c:wait_consider_task(): "If a + // ptracer wants to distinguish the two events for its own + // children, it should create a separate process which takes + // the role of real parent.") + if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { + continue + } + anyWaitableTasks = true + if opts.Events&EventChildGroupStop != 0 { + if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { + return wr, anyWaitableTasks } - if tracee.exitState >= TaskExitInitiated { - continue + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { + return wr, anyWaitableTasks } + } + } + for tracee := range parent.ptraceTracees { + if !opts.matchesTask(tracee, parent.tg.pidns) { + continue + } + // Non-leaders do notify tracers on exit. + if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { anyWaitableTasks = true - if opts.Events&EventTraceeStop != 0 { - if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { - return wr, nil - } + if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { + return wr, anyWaitableTasks } - if opts.Events&EventGroupContinue != 0 { - if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { - return wr, nil - } + } + if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { + continue + } + if tracee.exitState >= TaskExitInitiated { + continue + } + anyWaitableTasks = true + if opts.Events&EventTraceeStop != 0 { + if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks } } } - if anyWaitableTasks { - return nil, ErrNoWaitableEvent - } - return nil, syserror.ECHILD + return nil, anyWaitableTasks } // Preconditions: The TaskSet mutex must be locked for writing. diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index cc441460c..14fa7ef92 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -183,7 +183,7 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { - if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WALL|linux.WCLONE) != 0 { + if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { return 0, syscall.EINVAL } wopts := kernel.WaitOptions{ @@ -227,6 +227,9 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = kernel.ERESTARTSYS } + if options&linux.WNOTHREAD == 0 { + wopts.SiblingChildren = true + } wr, err := t.Wait(&wopts) if err != nil { @@ -278,7 +281,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal options := int(args[3].Uint()) rusageAddr := args[4].Pointer() - if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT) != 0 { + if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD) != 0 { return 0, nil, syscall.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { @@ -310,6 +313,9 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = kernel.ERESTARTSYS } + if options&linux.WNOTHREAD == 0 { + wopts.SiblingChildren = true + } wr, err := t.Wait(&wopts) if err != nil { diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index e8caf31fc..014679ec5 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3179,7 +3179,9 @@ cc_binary( "//test/util:signal_util", "//test/util:test_main", "//test/util:test_util", + "//test/util:thread_util", "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", "@com_google_googletest//:gtest", ], diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index 50d0725a7..da3b97828 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -21,11 +21,13 @@ #include <unistd.h> #include <functional> +#include <tuple> #include <vector> #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/strings/str_cat.h" +#include "absl/synchronization/mutex.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/util/cleanup.h" @@ -34,6 +36,7 @@ #include "test/util/posix_error.h" #include "test/util/signal_util.h" #include "test/util/test_util.h" +#include "test/util/thread_util.h" using ::testing::UnorderedElementsAre; @@ -42,10 +45,8 @@ using ::testing::UnorderedElementsAre; // // NOTE(b/22640830,b/27680907,b/29049891): Some functionality is not tested as // it is not currently supported by gVisor: -// * UID in waitid(2) siginfo. // * Process groups. // * Core dump status (WCOREDUMP). -// * Linux only option __WNOTHREAD. // // Tests for waiting on stopped/continued children are in sigstop.cc. @@ -357,13 +358,22 @@ INSTANTIATE_TEST_SUITE_P( return static_cast<pid_t>(si.si_pid); })); -// Fixture for tests parameterized by a function that takes the PID of a -// specific child to wait for, waits for it to exit, and checks that it exits -// with the given code. +// Fixture for tests parameterized by a (sysno, function) tuple. The function +// takes the PID of a specific child to wait for, waits for it to exit, and +// checks that it exits with the given code. class WaitSpecificChildTest - : public ::testing::TestWithParam<std::function<PosixError(pid_t, int)>> { + : public ::testing::TestWithParam< + std::tuple<int, std::function<PosixError(pid_t, int, int)>>> { protected: - PosixError WaitFor(pid_t pid, int code) { return GetParam()(pid, code); } + int Sysno() { return std::get<0>(GetParam()); } + + PosixError WaitForWithOptions(pid_t pid, int options, int code) { + return std::get<1>(GetParam())(pid, options, code); + } + + PosixError WaitFor(pid_t pid, int code) { + return std::get<1>(GetParam())(pid, 0, code); + } }; // Wait for specific child to exit. @@ -432,6 +442,75 @@ TEST_P(WaitSpecificChildTest, AfterExit) { EXPECT_NO_ERRNO(WaitFor(child, 0)); } +// Wait for child of sibling thread. +TEST_P(WaitSpecificChildTest, SiblingChildren) { + absl::Mutex mu; + pid_t child; + bool ready = false; + bool stop = false; + + ScopedThread t([&] { + absl::MutexLock ml(&mu); + EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + ready = true; + mu.Await(absl::Condition(&stop)); + }); + + // N.B. This must be declared after ScopedThread, so it is destructed first, + // thus waking the thread. + absl::MutexLock ml(&mu); + mu.Await(absl::Condition(&ready)); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); + + // Keep the sibling alive until after we've waited so the child isn't + // reparented. + stop = true; +} + +// Waiting for child of sibling thread not allowed with WNOTHREAD. +TEST_P(WaitSpecificChildTest, SiblingChildrenWNOTHREAD) { + // Linux added WNOTHREAD support to waitid(2) in + // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to + // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7. + // + // Skip the test if it isn't supported yet. + if (Sysno() == SYS_waitid) { + int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WNOTHREAD); + SKIP_IF(ret < 0 && errno == EINVAL); + } + + absl::Mutex mu; + pid_t child; + bool ready = false; + bool stop = false; + + ScopedThread t([&] { + absl::MutexLock ml(&mu); + EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + ready = true; + mu.Await(absl::Condition(&stop)); + + // This thread can wait on child. + EXPECT_NO_ERRNO(WaitForWithOptions(child, __WNOTHREAD, 0)); + }); + + // N.B. This must be declared after ScopedThread, so it is destructed first, + // thus waking the thread. + absl::MutexLock ml(&mu); + mu.Await(absl::Condition(&ready)); + + // This thread can't wait on child. + EXPECT_THAT( + WaitForWithOptions(child, __WNOTHREAD, 0), + PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); + + // Keep the sibling alive until after we've waited so the child isn't + // reparented. + stop = true; +} + // Wait for specific child to exit. // A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like // a forked process. @@ -551,55 +630,53 @@ TEST_P(WaitSpecificChildTest, AfterChildExecve) { EXPECT_NO_ERRNO(WaitFor(child, 0)); } -INSTANTIATE_TEST_SUITE_P( - Waiters, WaitSpecificChildTest, - ::testing::Values( - [](pid_t pid, int code) -> PosixError { - int status; - auto const rv = Wait4(pid, &status, 0, nullptr); - MaybeSave(); - if (rv < 0) { - return PosixError(errno, "wait4"); - } else if (rv != pid) { - return PosixError(EINVAL, absl::StrCat("unexpected pid: got ", rv, - ", wanted ", pid)); - } - if (!WIFEXITED(status) || WEXITSTATUS(status) != code) { - return PosixError( - EINVAL, absl::StrCat("unexpected wait status: got ", status, - ", wanted ", code)); - } - return NoError(); - }, - [](pid_t pid, int code) -> PosixError { - siginfo_t si; - auto const rv = Waitid(P_PID, pid, &si, WEXITED); - MaybeSave(); - if (rv < 0) { - return PosixError(errno, "waitid"); - } - if (si.si_pid != pid) { - return PosixError(EINVAL, - absl::StrCat("unexpected pid: got ", si.si_pid, +PosixError CheckWait4(pid_t pid, int options, int code) { + int status; + auto const rv = Wait4(pid, &status, options, nullptr); + MaybeSave(); + if (rv < 0) { + return PosixError(errno, "wait4"); + } else if (rv != pid) { + return PosixError( + EINVAL, absl::StrCat("unexpected pid: got ", rv, ", wanted ", pid)); + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != code) { + return PosixError(EINVAL, absl::StrCat("unexpected wait status: got ", + status, ", wanted ", code)); + } + return NoError(); +}; + +PosixError CheckWaitid(pid_t pid, int options, int code) { + siginfo_t si; + auto const rv = Waitid(P_PID, pid, &si, options | WEXITED); + MaybeSave(); + if (rv < 0) { + return PosixError(errno, "waitid"); + } + if (si.si_pid != pid) { + return PosixError(EINVAL, absl::StrCat("unexpected pid: got ", si.si_pid, ", wanted ", pid)); - } - if (si.si_signo != SIGCHLD) { - return PosixError( - EINVAL, absl::StrCat("unexpected signo: got ", si.si_signo, - ", wanted ", SIGCHLD)); - } - if (si.si_status != code) { - return PosixError( - EINVAL, absl::StrCat("unexpected status: got ", si.si_status, - ", wanted ", code)); - } - if (si.si_code != CLD_EXITED) { - return PosixError(EINVAL, - absl::StrCat("unexpected code: got ", si.si_code, + } + if (si.si_signo != SIGCHLD) { + return PosixError(EINVAL, absl::StrCat("unexpected signo: got ", + si.si_signo, ", wanted ", SIGCHLD)); + } + if (si.si_status != code) { + return PosixError(EINVAL, absl::StrCat("unexpected status: got ", + si.si_status, ", wanted ", code)); + } + if (si.si_code != CLD_EXITED) { + return PosixError(EINVAL, absl::StrCat("unexpected code: got ", si.si_code, ", wanted ", CLD_EXITED)); - } - return NoError(); - })); + } + return NoError(); +} + +INSTANTIATE_TEST_SUITE_P( + Waiters, WaitSpecificChildTest, + ::testing::Values(std::make_tuple(SYS_wait4, CheckWait4), + std::make_tuple(SYS_waitid, CheckWaitid))); // WIFEXITED, WIFSIGNALED, WTERMSIG indicate signal exit. TEST(WaitTest, SignalExit) { |