// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <signal.h>
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <functional>
#include <tuple>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/strings/str_cat.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "test/util/cleanup.h"
#include "test/util/file_descriptor.h"
#include "test/util/logging.h"
#include "test/util/multiprocess_util.h"
#include "test/util/posix_error.h"
#include "test/util/signal_util.h"
#include "test/util/test_util.h"
#include "test/util/thread_util.h"
#include "test/util/time_util.h"

using ::testing::UnorderedElementsAre;

// These unit tests focus on the wait4(2) system call, but also include basic
// checks for the i386 waitpid(2) syscall, which is a subset of wait4(2).
//
// NOTE(b/22640830,b/27680907,b/29049891): Some functionality is not tested as
// it is not currently supported by gVisor:
// * Process groups.
// * Core dump status (WCOREDUMP).
//
// Tests for waiting on stopped/continued children are in sigstop.cc.

namespace gvisor {
namespace testing {

namespace {

// Size of the stack mapped for cloned children. CloneChild seems to need more
// than a single page of stack, so two pages are used.
static const size_t kStackSize = kPageSize * 2;

// Entry point of the child created by CloneAndExit. priv is not a real
// pointer: its value is the number of seconds to sleep before exiting.
// This child does not have TLS set up, so it must not use glibc functions.
int CloneChild(void* priv) {
  const auto delay_seconds = reinterpret_cast<int64_t>(priv);
  SleepSafe(absl::Seconds(delay_seconds));

  // Use the raw exit(2) syscall: glibc's _exit(2) wrapper would helpfully
  // call exit_group(2) and take down the entire process.
  syscall(__NR_exit, 0);
  return 1;
}

// ForkAndExit forks a child process which sleeps for `sleep` seconds and then
// exits with exit_code. The value returned by fork(2) is handed back to the
// caller (the child's PID, or -1 if the fork failed).
pid_t ForkAndExit(int exit_code, int64_t sleep) {
  const pid_t pid = fork();
  if (pid != 0) {
    // Parent (or fork failure): report the result to the caller.
    return pid;
  }

  // Child: wait out the requested delay, then terminate immediately.
  SleepSafe(absl::Seconds(sleep));
  _exit(exit_code);
}

// Returns the current reading of clock `id` in nanoseconds. A failing
// clock_gettime(2) is caught by TEST_PCHECK.
int64_t clock_gettime_nsecs(clockid_t id) {
  struct timespec ts;
  TEST_PCHECK(clock_gettime(id, &ts) == 0);

  // Widen tv_sec before multiplying. Where tv_sec is a 32-bit type the
  // product would otherwise be computed in 32 bits and overflow as soon as
  // the clock reads more than about two seconds.
  constexpr int64_t kNanosPerSecond = 1000000000;
  return static_cast<int64_t>(ts.tv_sec) * kNanosPerSecond + ts.tv_nsec;
}

// Busy-loops on the calling thread until it has consumed `sec` additional
// seconds as measured by CLOCK_THREAD_CPUTIME_ID. The loop body always runs
// at least once.
void spin(int64_t sec) {
  const int64_t begin = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
  const int64_t deadline = begin + sec * 1000000000;

  for (;;) {
    constexpr int kLoopCount = 1000000;  // large and arbitrary
    // The counter is volatile so the compiler cannot drop the empty loop.
    for (volatile int i = 0; i < kLoopCount; i++) {
    }

    if (clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID) >= deadline) {
      break;
    }
  }
}

// ForkSpinAndExit forks a child process which spins for `spintime` seconds
// (see spin) and then exits with exit_code. The value returned by fork(2) is
// handed back to the caller.
pid_t ForkSpinAndExit(int exit_code, int64_t spintime) {
  const pid_t pid = fork();
  if (pid != 0) {
    // Parent (or fork failure).
    return pid;
  }

  // Child: burn CPU for the requested time, then terminate immediately.
  spin(spintime);
  _exit(exit_code);
}

// Returns the sum of the user (ru_utime) and system (ru_stime) CPU time
// recorded in ru.
absl::Duration RusageCpuTime(const struct rusage& ru) {
  const absl::Duration user_time = absl::DurationFromTimeval(ru.ru_utime);
  const absl::Duration sys_time = absl::DurationFromTimeval(ru.ru_stime);
  return user_time + sys_time;
}

// Maps kStackSize bytes of anonymous, private, read-write memory and returns
// the address of the top of that region (one past its last byte). If mmap(2)
// fails, MAP_FAILED is returned as an integer instead.
// Free with FreeStack.
uintptr_t AllocStack() {
  void* const base = mmap(nullptr, kStackSize, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  const auto base_addr = reinterpret_cast<uintptr_t>(base);

  // Pass the failure sentinel through untouched; otherwise point at the top.
  return base == MAP_FAILED ? base_addr : base_addr + kStackSize;
}

// Releases a stack obtained from AllocStack. addr is the top-of-stack value
// that AllocStack returned. Returns the result of munmap(2).
//
// NOTE(review): only kPageSize bytes at the base of the mapping are unmapped,
// although AllocStack maps kStackSize bytes, so the remainder stays mapped.
// Some tests in this file free the stack of a cloned thread they cannot wait
// for; confirm none of them relies on the upper part staying mapped before
// changing the length to kStackSize.
int FreeStack(uintptr_t addr) {
  void* const base = reinterpret_cast<void*>(addr - kStackSize);
  return munmap(base, kPageSize);
}

// CloneAndExit clones a child running CloneChild on the given stack; the child
// exits with 0 after sleeping for `sleep` seconds. extra_flags are ORed into
// the standard clone(2) flags used here. Returns the result of clone(2).
int CloneAndExit(int64_t sleep, uintptr_t stack, int extra_flags) {
  constexpr int kBaseFlags = CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_VM;

  void* const child_stack = reinterpret_cast<void*>(stack);
  // The sleep duration travels to CloneChild encoded in the pointer value.
  void* const child_arg = reinterpret_cast<void*>(sleep);

  return clone(CloneChild, child_stack, kBaseFlags | extra_flags, child_arg);
}

// Simple wrappers around wait4(2) and waitid(2) that ignore interrupts.
// Each is produced by handing the libc function to RetryEINTR and is called
// with the same arguments as the function it wraps.
constexpr auto Wait4 = RetryEINTR(wait4);
constexpr auto Waitid = RetryEINTR(waitid);

// Fixture for tests parameterized by a waiter function. The waiter takes
// (expected exit code, wait options), waits for any child to exit, checks that
// it exited with the expected code, and returns the child's PID.
//
// N.B. These tests run in a multi-threaded environment. We assume that
// background threads do not create child processes and are not themselves
// created with clone(... | SIGCHLD). Either may cause these tests to
// erroneously wait on child processes/threads.
class WaitAnyChildTest : public ::testing::TestWithParam<
                             std::function<PosixErrorOr<pid_t>(int, int)>> {
 protected:
  // Invokes the parameterized waiter with the expected exit code and the
  // given wait options.
  PosixErrorOr<pid_t> WaitAnyWithOptions(int code, int options) {
    const auto& waiter = GetParam();
    return waiter(code, options);
  }

  // Same as WaitAnyWithOptions, with no wait options set.
  PosixErrorOr<pid_t> WaitAny(int code) {
    constexpr int kNoOptions = 0;
    return WaitAnyWithOptions(code, kNoOptions);
  }
};

// A forked child that exits immediately is returned by a wait-for-any.
TEST_P(WaitAnyChildTest, Fork) {
  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));
}

// Wait for any child only after the child has already had time to exit.
TEST_P(WaitAnyChildTest, AfterExit) {
  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  // Give the child ample time to exit before we start waiting.
  absl::SleepFor(absl::Seconds(5));

  EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));
}

// With two exited children, two successive wait-for-any calls return both
// PIDs, in either order.
TEST_P(WaitAnyChildTest, MultipleFork) {
  const pid_t first = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(first, SyscallSucceeds());
  const pid_t second = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(second, SyscallSucceeds());

  std::vector<pid_t> reaped;
  reaped.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
  reaped.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
  EXPECT_THAT(reaped, UnorderedElementsAre(first, second));
}

// Wait for any child to exit, where the child was created with clone(2).
// A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like
// a forked process.
TEST_P(WaitAnyChildTest, CloneSIGCHLD) {
  const uintptr_t stack = AllocStack();
  ASSERT_THAT(stack, SyscallSucceeds());
  auto stack_cleanup =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  const int child = CloneAndExit(/*sleep=*/0, stack, SIGCHLD);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));
}

// One forked child and one cloned child are both returned by successive
// wait-for-any calls, in either order.
TEST_P(WaitAnyChildTest, ForkAndClone) {
  const pid_t forked = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(forked, SyscallSucceeds());

  const uintptr_t stack = AllocStack();
  ASSERT_THAT(stack, SyscallSucceeds());
  auto stack_cleanup =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  // The clone sends SIGCHLD on exit so that normal wait semantics apply.
  const int cloned = CloneAndExit(/*sleep=*/0, stack, SIGCHLD);
  ASSERT_THAT(cloned, SyscallSucceeds());

  std::vector<pid_t> reaped;
  reaped.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
  reaped.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
  EXPECT_THAT(reaped, UnorderedElementsAre(forked, cloned));
}

// With WNOHANG and no children created by this test, the wait is expected to
// fail with ECHILD rather than block.
TEST_P(WaitAnyChildTest, WaitWNOHANG) {
  const auto result = WaitAnyWithOptions(/*code=*/0, WNOHANG);
  EXPECT_THAT(result, PosixErrorIs(ECHILD, ::testing::_));
}

// An invalid options value is rejected with EINVAL.
TEST_P(WaitAnyChildTest, BadOption) {
  const auto result = WaitAnyWithOptions(/*code=*/0, /*options=*/123456);
  EXPECT_THAT(result, PosixErrorIs(EINVAL, ::testing::_));
}

// After a child that spun for kSpin of CPU time has been waited for,
// RUSAGE_CHILDREN must have grown by at least kSpin.
TEST_P(WaitAnyChildTest, WaitedChildRusage) {
  constexpr absl::Duration kSpin = absl::Seconds(3);

  struct rusage usage_before;
  ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &usage_before), SyscallSucceeds());

  const pid_t child = ForkSpinAndExit(0, absl::ToInt64Seconds(kSpin));
  ASSERT_THAT(child, SyscallSucceeds());
  ASSERT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));

  struct rusage usage_after;
  ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &usage_after), SyscallSucceeds());

  const absl::Duration consumed =
      RusageCpuTime(usage_after) - RusageCpuTime(usage_before);
  EXPECT_GE(consumed, kSpin);
}

// With SIGCHLD ignored, waiting blocks until the child terminates and then
// fails with ECHILD, and the child's CPU time is not added to RUSAGE_CHILDREN.
TEST_P(WaitAnyChildTest, IgnoredChildRusage) {
  // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
  // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
  // sigaction(2)), then children that terminate do not become zombies and a
  // call to wait() or waitpid() will block until all children have terminated,
  // and then fail with errno set to ECHILD." - waitpid(2)
  //
  // "RUSAGE_CHILDREN: Return resource usage statistics for all children of the
  // calling process that have terminated *and been waited for*." -
  // getrusage(2), emphasis added

  // Value-initialize so that sa_mask and sa_flags are zero rather than
  // whatever happened to be on the stack; only the handler is customized.
  struct sigaction sa = {};
  sa.sa_handler = SIG_IGN;
  const auto cleanup_sigact =
      ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGCHLD, sa));

  struct rusage before;
  ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &before), SyscallSucceeds());

  const absl::Duration start =
      absl::Nanoseconds(clock_gettime_nsecs(CLOCK_MONOTONIC));

  constexpr absl::Duration kSpin = absl::Seconds(3);

  // ForkSpinAndExit measures its spin with CLOCK_THREAD_CPUTIME_ID, which is
  // lower resolution than, and may diverge from, CLOCK_MONOTONIC, so we allow
  // a small grace period but still check that we blocked for a while.
  constexpr absl::Duration kSpinGrace = absl::Milliseconds(100);

  pid_t child;
  ASSERT_THAT(child = ForkSpinAndExit(0, absl::ToInt64Seconds(kSpin)),
              SyscallSucceeds());
  ASSERT_THAT(WaitAny(0), PosixErrorIs(ECHILD, ::testing::_));
  const absl::Duration end =
      absl::Nanoseconds(clock_gettime_nsecs(CLOCK_MONOTONIC));
  EXPECT_GE(end - start, kSpin - kSpinGrace);

  // The child was never waited for successfully, so the children's rusage
  // must be exactly what it was before the fork.
  struct rusage after;
  ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &after), SyscallSucceeds());
  EXPECT_EQ(before.ru_utime.tv_sec, after.ru_utime.tv_sec);
  EXPECT_EQ(before.ru_utime.tv_usec, after.ru_utime.tv_usec);
  EXPECT_EQ(before.ru_stime.tv_sec, after.ru_stime.tv_sec);
  EXPECT_EQ(before.ru_stime.tv_usec, after.ru_stime.tv_usec);
}

INSTANTIATE_TEST_SUITE_P(
    Waiters, WaitAnyChildTest,
    ::testing::Values(
        // Waiter built on wait4(2) with pid -1 (any child).
        [](int code, int options) -> PosixErrorOr<pid_t> {
          int wstatus;
          const auto reaped = Wait4(-1, &wstatus, options, nullptr);
          MaybeSave();
          if (reaped < 0) {
            return PosixError(errno, "wait4");
          }
          // The child must have exited normally with the expected code.
          const bool exited_as_expected =
              WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == code;
          if (!exited_as_expected) {
            return PosixError(
                EINVAL, absl::StrCat("unexpected wait status: got ", wstatus,
                                     ", wanted ", code));
          }
          return static_cast<pid_t>(reaped);
        },
        // Waiter built on waitid(2) with P_ALL; WEXITED is always added to
        // the caller's options. Every siginfo field is validated in turn.
        [](int code, int options) -> PosixErrorOr<pid_t> {
          siginfo_t info;
          const auto ret = Waitid(P_ALL, 0, &info, WEXITED | options);
          MaybeSave();
          if (ret < 0) {
            return PosixError(errno, "waitid");
          }
          if (info.si_signo != SIGCHLD) {
            return PosixError(
                EINVAL, absl::StrCat("unexpected signo: got ", info.si_signo,
                                     ", wanted ", SIGCHLD));
          }
          if (info.si_status != code) {
            return PosixError(
                EINVAL, absl::StrCat("unexpected status: got ", info.si_status,
                                     ", wanted ", code));
          }
          if (info.si_code != CLD_EXITED) {
            return PosixError(
                EINVAL, absl::StrCat("unexpected code: got ", info.si_code,
                                     ", wanted ", CLD_EXITED));
          }
          // The child is expected to report the same real UID as this process.
          const auto expected_uid = getuid();
          if (info.si_uid != expected_uid) {
            return PosixError(
                EINVAL, absl::StrCat("unexpected uid: got ", info.si_uid,
                                     ", wanted ", expected_uid));
          }
          return static_cast<pid_t>(info.si_pid);
        }));

// Fixture for tests parameterized by a (sysno, function) tuple. The function
// takes (pid, wait options, expected exit code): it waits for that specific
// child to exit and checks that it exited with the given code.
class WaitSpecificChildTest
    : public ::testing::TestWithParam<
          std::tuple<int, std::function<PosixError(pid_t, int, int)>>> {
 protected:
  // Syscall number associated with the waiter under test.
  int Sysno() {
    const auto& param = GetParam();
    return std::get<0>(param);
  }

  // Runs the parameterized waiter on pid with the given wait options.
  PosixError WaitForWithOptions(pid_t pid, int options, int code) {
    const auto& waiter = std::get<1>(GetParam());
    return waiter(pid, options, code);
  }

  // Runs the parameterized waiter on pid with no wait options set.
  PosixError WaitFor(pid_t pid, int code) {
    return WaitForWithOptions(pid, /*options=*/0, code);
  }
};

// A specific forked child can be waited for by PID.
TEST_P(WaitSpecificChildTest, Fork) {
  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitFor(child, 0));
}

// A non-zero exit code is reported back to the waiter unchanged.
TEST_P(WaitSpecificChildTest, NormalExit) {
  constexpr int kExitCode = 42;
  const pid_t child = ForkAndExit(kExitCode, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitFor(child, kExitCode));
}

// Two children can each be waited for by PID, in creation order.
TEST_P(WaitSpecificChildTest, MultipleFork) {
  const pid_t first = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(first, SyscallSucceeds());
  const pid_t second = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(second, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitFor(first, 0));
  EXPECT_NO_ERRNO(WaitFor(second, 0));
}

// Two children can each be waited for by PID in the reverse of the order in
// which they were created.
TEST_P(WaitSpecificChildTest, MultipleForkOutOfOrder) {
  const pid_t first = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(first, SyscallSucceeds());
  const pid_t second = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(second, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitFor(second, 0));
  EXPECT_NO_ERRNO(WaitFor(first, 0));
}

// Wait for a specific child that is still sleeping, so the wait is entered
// before the exit occurs.
TEST_P(WaitSpecificChildTest, ForkSleep) {
  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/5);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitFor(child, 0));
}

// The wait must not return before the child's sleep has elapsed.
TEST_P(WaitSpecificChildTest, ForkBlock) {
  constexpr int64_t kChildSleepSeconds = 5;

  const absl::Time start = absl::Now();
  const pid_t child = ForkAndExit(/*exit_code=*/0, kChildSleepSeconds);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitFor(child, 0));

  const absl::Duration elapsed = absl::Now() - start;
  EXPECT_GE(elapsed, absl::Seconds(kChildSleepSeconds));
}

// A child that has already had time to exit can still be waited for by PID.
TEST_P(WaitSpecificChildTest, AfterExit) {
  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  // Give the child ample time to exit before we start waiting.
  absl::SleepFor(absl::Seconds(5));

  EXPECT_NO_ERRNO(WaitFor(child, 0));
}

// Wait for child of sibling thread.
//
// A helper thread forks the child; the main test thread then waits for it.
// `ready` and `stop` are handshake flags protected by `mu`.
TEST_P(WaitSpecificChildTest, SiblingChildren) {
  absl::Mutex mu;
  pid_t child;
  bool ready = false;
  bool stop = false;

  ScopedThread t([&] {
    absl::MutexLock ml(&mu);
    EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
    // Tell the main thread that `child` is set, then stay alive until told to
    // stop.
    ready = true;
    mu.Await(absl::Condition(&stop));
  });

  // N.B. This must be declared after ScopedThread, so it is destructed first,
  // thus waking the thread.
  absl::MutexLock ml(&mu);
  // Block until the sibling has forked the child.
  mu.Await(absl::Condition(&ready));

  EXPECT_NO_ERRNO(WaitFor(child, 0));

  // Keep the sibling alive until after we've waited so the child isn't
  // reparented.
  stop = true;
}

// Waiting for child of sibling thread not allowed with WNOTHREAD.
//
// A helper thread forks the child. The main thread's __WNOTHREAD wait is
// expected to fail with ECHILD, while the helper thread's own __WNOTHREAD wait
// is expected to succeed. `ready` and `stop` are handshake flags protected by
// `mu`.
TEST_P(WaitSpecificChildTest, SiblingChildrenWNOTHREAD) {
  // Linux added WNOTHREAD support to waitid(2) in
  // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
  // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
  //
  // Skip the test if it isn't supported yet.
  if (Sysno() == SYS_waitid) {
    // Probe: EINVAL here means the flag itself was rejected.
    int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WNOTHREAD);
    SKIP_IF(ret < 0 && errno == EINVAL);
  }

  absl::Mutex mu;
  pid_t child;
  bool ready = false;
  bool stop = false;

  ScopedThread t([&] {
    absl::MutexLock ml(&mu);
    EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
    // Tell the main thread that `child` is set, then hold off reaping it until
    // the main thread has made its own attempt.
    ready = true;
    mu.Await(absl::Condition(&stop));

    // This thread can wait on child.
    EXPECT_NO_ERRNO(WaitForWithOptions(child, __WNOTHREAD, 0));
  });

  // N.B. This must be declared after ScopedThread, so it is destructed first,
  // thus waking the thread.
  absl::MutexLock ml(&mu);
  // Block until the sibling has forked the child.
  mu.Await(absl::Condition(&ready));

  // This thread can't wait on child.
  EXPECT_THAT(WaitForWithOptions(child, __WNOTHREAD, 0),
              PosixErrorIs(ECHILD, ::testing::_));

  // Keep the sibling alive until after we've waited so the child isn't
  // reparented.
  stop = true;
}

// Wait for a specific clone(2) child to exit.
// A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like
// a forked process.
TEST_P(WaitSpecificChildTest, CloneSIGCHLD) {
  const uintptr_t stack = AllocStack();
  ASSERT_THAT(stack, SyscallSucceeds());
  auto stack_cleanup =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  const int child = CloneAndExit(/*sleep=*/0, stack, SIGCHLD);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitFor(child, 0));
}

// Wait for a specific clone(2) child that does not send SIGCHLD.
// A non-CLONE_THREAD child which does not send SIGCHLD upon exit can be waited
// on, but the wait returns ECHILD.
TEST_P(WaitSpecificChildTest, CloneNoSIGCHLD) {
  const uintptr_t stack = AllocStack();
  ASSERT_THAT(stack, SyscallSucceeds());
  auto stack_cleanup =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  const int child = CloneAndExit(/*sleep=*/0, stack, /*extra_flags=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_THAT(WaitFor(child, 0), PosixErrorIs(ECHILD, ::testing::_));
}

// A clone(2) child that has already had time to exit can still be waited for.
TEST_P(WaitSpecificChildTest, CloneAfterExit) {
  const uintptr_t stack = AllocStack();
  ASSERT_THAT(stack, SyscallSucceeds());
  auto stack_cleanup =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  // The clone sends SIGCHLD on exit so that normal wait semantics apply.
  const int child = CloneAndExit(/*sleep=*/0, stack, SIGCHLD);
  ASSERT_THAT(child, SyscallSucceeds());

  // Give the child ample time to exit before we start waiting.
  absl::SleepFor(absl::Seconds(5));

  EXPECT_NO_ERRNO(WaitFor(child, 0));
}

// A CLONE_THREAD child cannot be waited on: the wait must fail with ECHILD
// promptly instead of blocking until the thread exits.
TEST_P(WaitSpecificChildTest, CloneThread) {
  // How long the cloned thread sleeps before it exits.
  constexpr int64_t kChildSleepSeconds = 15;
  // Upper bound on how long the failing wait may take. It is well below the
  // child's lifetime, so a wait that blocked on the child is detectable.
  constexpr absl::Duration kMaxWait = absl::Seconds(10);
  // Margin allowed for the child to actually exit once its sleep is over.
  constexpr absl::Duration kExitGrace = absl::Seconds(1);

  uintptr_t stack;
  ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
  auto free =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  int child;
  ASSERT_THAT(child = CloneAndExit(kChildSleepSeconds, stack, CLONE_THREAD),
              SyscallSucceeds());
  const absl::Time start = absl::Now();

  EXPECT_THAT(WaitFor(child, 0), PosixErrorIs(ECHILD, ::testing::_));

  // Ensure wait4 didn't block.
  EXPECT_LE(absl::Now() - start, kMaxWait);

  // Since we can't wait on the child, sleep until its whole sleep period (plus
  // a margin) has elapsed, so that the stack is not freed while the thread is
  // still running on it. Sleeping for less than the child's own sleep would
  // release the stack underneath a live thread.
  const absl::Time child_exit_deadline =
      start + absl::Seconds(kChildSleepSeconds) + kExitGrace;
  absl::SleepFor(child_exit_deadline - absl::Now());
}

// A clone(2) child that does not send SIGCHLD on exit may be waited on with
// the __WCLONE flag.
TEST_P(WaitSpecificChildTest, CloneWCLONE) {
  // Linux added WCLONE support to waitid(2) in
  // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
  // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
  //
  // Probe for the flag and skip the test if it isn't supported yet.
  if (Sysno() == SYS_waitid) {
    int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WCLONE);
    SKIP_IF(ret < 0 && errno == EINVAL);
  }

  const uintptr_t stack = AllocStack();
  ASSERT_THAT(stack, SyscallSucceeds());
  auto stack_cleanup =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  const int child = CloneAndExit(/*sleep=*/0, stack, /*extra_flags=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitForWithOptions(child, __WCLONE, 0));
}

// A forked child cannot be waited on with WCLONE, but remains waitable
// without it.
TEST_P(WaitSpecificChildTest, ForkWCLONE) {
  // Linux added WCLONE support to waitid(2) in
  // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
  // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
  //
  // Probe for the flag and skip the test if it isn't supported yet.
  if (Sysno() == SYS_waitid) {
    int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WCLONE);
    SKIP_IF(ret < 0 && errno == EINVAL);
  }

  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  // With __WCLONE the forked child is not eligible.
  const PosixError wclone_result =
      WaitForWithOptions(child, WNOHANG | __WCLONE, 0);
  EXPECT_THAT(wclone_result, PosixErrorIs(ECHILD, ::testing::_));

  // A plain wait still reaps it.
  EXPECT_NO_ERRNO(WaitFor(child, 0));
}

// Any type of child can be waited on with WALL: first a forked child, then a
// clone(2) child that does not send SIGCHLD.
TEST_P(WaitSpecificChildTest, WALL) {
  // Linux added WALL support to waitid(2) in
  // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
  // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
  //
  // Probe for the flag and skip the test if it isn't supported yet.
  if (Sysno() == SYS_waitid) {
    int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WALL);
    SKIP_IF(ret < 0 && errno == EINVAL);
  }

  const pid_t forked = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(forked, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitForWithOptions(forked, __WALL, 0));

  const uintptr_t stack = AllocStack();
  ASSERT_THAT(stack, SyscallSucceeds());
  auto stack_cleanup =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  const pid_t cloned = CloneAndExit(/*sleep=*/0, stack, /*extra_flags=*/0);
  ASSERT_THAT(cloned, SyscallSucceeds());

  EXPECT_NO_ERRNO(WaitForWithOptions(cloned, __WALL, 0));
}

// Waiting for a PID that this test never created fails with ECHILD.
TEST_P(WaitSpecificChildTest, BadChild) {
  constexpr pid_t kNotOurChild = 42;
  EXPECT_THAT(WaitFor(kNotOurChild, 0), PosixErrorIs(ECHILD, ::testing::_));
}

// Wait for a child process that only exits after calling execve(2) from a
// non-leader thread.
TEST_P(WaitSpecificChildTest, AfterChildExecve) {
  // The argv array is built before forking so the child does not have to
  // construct it.
  ExecveArray const owned_child_argv = {"/bin/true"};
  char* const* const child_argv = owned_child_argv.get();

  // Stack for the thread that the forked child will clone.
  uintptr_t stack;
  ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
  auto free =
      Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });

  pid_t const child = fork();
  if (child == 0) {
    // Give the parent some time to start waiting.
    SleepSafe(absl::Seconds(5));
    // Pass CLONE_VFORK to block the original thread in the child process until
    // the clone thread calls execve, annihilating them both. (This means that
    // if clone returns at all, something went wrong.)
    //
    // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's
    // x86_64 implementation is safe. See glibc
    // sysdeps/unix/sysv/linux/x86_64/clone.S.
    clone(
        // Thread body: exec argv[0]; only returns (with errno) if execve fails.
        +[](void* arg) {
          auto child_argv = static_cast<char* const*>(arg);
          execve(child_argv[0], child_argv, /* envp = */ nullptr);
          return errno;
        },
        reinterpret_cast<void*>(stack),
        CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM |
            CLONE_VFORK,
        const_cast<char**>(child_argv));
    // Only reached if clone returned; exit with errno as the exit code.
    _exit(errno);
  }
  ASSERT_THAT(child, SyscallSucceeds());
  // The test expects the child to be reported as exiting with code 0.
  EXPECT_NO_ERRNO(WaitFor(child, 0));
}

// Waits for `pid` with wait4(2) using `options` and verifies that exactly that
// process was reaped and that it exited normally with exit code `code`.
// Returns NoError() on success; otherwise a PosixError carrying errno from the
// failed wait4, or EINVAL with a description of the mismatch.
PosixError CheckWait4(pid_t pid, int options, int code) {
  int wstatus;
  const auto reaped = Wait4(pid, &wstatus, options, nullptr);
  MaybeSave();

  if (reaped < 0) {
    return PosixError(errno, "wait4");
  }
  if (reaped != pid) {
    return PosixError(
        EINVAL, absl::StrCat("unexpected pid: got ", reaped, ", wanted ", pid));
  }

  const bool exited_as_expected =
      WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == code;
  if (!exited_as_expected) {
    return PosixError(EINVAL, absl::StrCat("unexpected wait status: got ",
                                           wstatus, ", wanted ", code));
  }
  return NoError();
}

// Waits for `pid` with waitid(2) using `options | WEXITED` and verifies the
// returned siginfo: si_pid, si_signo (SIGCHLD), si_status (`code`) and si_code
// (CLD_EXITED), in that order. Returns NoError() on success; otherwise a
// PosixError carrying errno from the failed waitid, or EINVAL describing the
// first field that did not match.
PosixError CheckWaitid(pid_t pid, int options, int code) {
  siginfo_t info;
  const auto ret = Waitid(P_PID, pid, &info, options | WEXITED);
  MaybeSave();

  if (ret < 0) {
    return PosixError(errno, "waitid");
  }
  if (info.si_pid != pid) {
    return PosixError(EINVAL, absl::StrCat("unexpected pid: got ", info.si_pid,
                                           ", wanted ", pid));
  }
  if (info.si_signo != SIGCHLD) {
    return PosixError(
        EINVAL, absl::StrCat("unexpected signo: got ", info.si_signo,
                             ", wanted ", SIGCHLD));
  }
  if (info.si_status != code) {
    return PosixError(
        EINVAL, absl::StrCat("unexpected status: got ", info.si_status,
                             ", wanted ", code));
  }
  if (info.si_code != CLD_EXITED) {
    return PosixError(EINVAL,
                      absl::StrCat("unexpected code: got ", info.si_code,
                                   ", wanted ", CLD_EXITED));
  }
  return NoError();
}

// Run every WaitSpecificChildTest once per waiter: wait4(2) via CheckWait4 and
// waitid(2) via CheckWaitid. The syscall number in each tuple is what Sysno()
// returns.
INSTANTIATE_TEST_SUITE_P(
    Waiters, WaitSpecificChildTest,
    ::testing::Values(std::make_tuple(SYS_wait4, CheckWait4),
                      std::make_tuple(SYS_waitid, CheckWaitid)));

// For a child killed by a signal, WIFEXITED is false, WIFSIGNALED is true and
// WTERMSIG reports the signal.
TEST(WaitTest, SignalExit) {
  // The child sleeps long enough that it is still alive when it is killed.
  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/10);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_THAT(kill(child, SIGKILL), SyscallSucceeds());

  int wstatus;
  EXPECT_THAT(Wait4(child, &wstatus, 0, nullptr),
              SyscallSucceedsWithValue(child));

  EXPECT_FALSE(WIFEXITED(wstatus));
  EXPECT_TRUE(WIFSIGNALED(wstatus));
  EXPECT_EQ(SIGKILL, WTERMSIG(wstatus));
}

// waitid requires at least one option; an options value of 0 is EINVAL.
TEST(WaitTest, WaitidOptions) {
  constexpr int kNoOptions = 0;
  EXPECT_THAT(Waitid(P_ALL, 0, nullptr, kNoOptions),
              SyscallFailsWithErrno(EINVAL));
}

// waitid does not report a child's exit unless WEXITED is passed: with only
// WSTOPPED it fails with ECHILD, and the exit is then collected with WEXITED.
TEST(WaitTest, WaitidNoWEXITED) {
  const pid_t child = ForkAndExit(/*exit_code=*/0, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  EXPECT_THAT(Waitid(P_ALL, 0, nullptr, WSTOPPED),
              SyscallFailsWithErrno(ECHILD));
  EXPECT_THAT(Waitid(P_ALL, 0, nullptr, WEXITED), SyscallSucceeds());
}

// WNOWAIT allows the same wait result to be returned again: a WNOWAIT wait
// followed by a normal wait both report the exit, and a third wait fails with
// ECHILD.
TEST(WaitTest, WaitidWNOWAIT) {
  constexpr int kExitCode = 42;
  const pid_t child = ForkAndExit(kExitCode, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  // Checks that `si` describes a normal exit of `child` with kExitCode.
  const auto expect_exit_report = [child](const siginfo_t& si) {
    EXPECT_EQ(child, si.si_pid);
    EXPECT_EQ(SIGCHLD, si.si_signo);
    EXPECT_EQ(CLD_EXITED, si.si_code);
    EXPECT_EQ(42, si.si_status);
  };

  siginfo_t info;

  // First wait leaves the child in a waitable state.
  ASSERT_THAT(Waitid(P_PID, child, &info, WEXITED | WNOWAIT),
              SyscallSucceeds());
  expect_exit_report(info);

  // Second wait sees the same result and reaps the child.
  ASSERT_THAT(Waitid(P_PID, child, &info, WEXITED), SyscallSucceeds());
  expect_exit_report(info);

  // Nothing is left to wait for.
  EXPECT_THAT(Waitid(P_PID, child, &info, WEXITED),
              SyscallFailsWithErrno(ECHILD));
}

// waitpid(pid, status, options) is equivalent to
// wait4(pid, status, options, nullptr).
// This is a dedicated syscall on i386, glibc maps it to wait4 on amd64.
TEST(WaitTest, WaitPid) {
  constexpr int kExitCode = 42;
  const pid_t child = ForkAndExit(kExitCode, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  int wstatus;
  EXPECT_THAT(RetryEINTR(waitpid)(child, &wstatus, 0),
              SyscallSucceedsWithValue(child));

  EXPECT_TRUE(WIFEXITED(wstatus));
  EXPECT_EQ(kExitCode, WEXITSTATUS(wstatus));
}

// Test that signaling a zombie succeeds. This is really a signals test, but it
// lives in this file.
TEST(WaitTest, KillZombie) {
  const pid_t child = ForkAndExit(/*exit_code=*/42, /*sleep=*/0);
  ASSERT_THAT(child, SyscallSucceeds());

  // Sleep for three seconds to ensure the child has exited.
  absl::SleepFor(absl::Seconds(3));

  // The child is now a zombie. Both a real signal and the null signal must be
  // accepted.
  EXPECT_THAT(kill(child, SIGTERM), SyscallSucceeds());
  EXPECT_THAT(kill(child, 0), SyscallSucceeds());

  // Finally reap the zombie.
  EXPECT_THAT(Wait4(child, nullptr, 0, nullptr),
              SyscallSucceedsWithValue(child));
}

// The rusage returned by wait4 accounts for at least the CPU time the child
// spent spinning.
TEST(WaitTest, Wait4Rusage) {
  constexpr absl::Duration kSpin = absl::Seconds(3);
  constexpr int kExitCode = 21;

  const pid_t child = ForkSpinAndExit(kExitCode, absl::ToInt64Seconds(kSpin));
  ASSERT_THAT(child, SyscallSucceeds());

  int wstatus;
  struct rusage child_usage = {};
  ASSERT_THAT(Wait4(child, &wstatus, 0, &child_usage),
              SyscallSucceedsWithValue(child));

  EXPECT_TRUE(WIFEXITED(wstatus));
  EXPECT_EQ(kExitCode, WEXITSTATUS(wstatus));

  EXPECT_GE(RusageCpuTime(child_usage), kSpin);
}

// The rusage returned through the raw waitid syscall's fifth argument accounts
// for at least the CPU time the child spent spinning.
TEST(WaitTest, WaitidRusage) {
  constexpr absl::Duration kSpin = absl::Seconds(3);
  constexpr int kExitCode = 27;

  const pid_t child = ForkSpinAndExit(kExitCode, absl::ToInt64Seconds(kSpin));
  ASSERT_THAT(child, SyscallSucceeds());

  siginfo_t info = {};
  struct rusage child_usage = {};

  // From waitid(2):
  // The  raw  waitid()  system  call  takes a fifth argument, of type
  // struct rusage *. If this argument is non-NULL, then  it  is  used
  // to return resource  usage  information  about  the  child,  in the
  // same manner as wait4(2).
  EXPECT_THAT(RetryEINTR(syscall)(SYS_waitid, P_PID, child, &info, WEXITED,
                                  &child_usage),
              SyscallSucceeds());
  EXPECT_EQ(info.si_signo, SIGCHLD);
  EXPECT_EQ(info.si_code, CLD_EXITED);
  EXPECT_EQ(info.si_status, kExitCode);
  EXPECT_EQ(info.si_pid, child);

  EXPECT_GE(RusageCpuTime(child_usage), kSpin);
}

// After bf959931ddb88c4e4366e96dd22e68fa0db9527c ("wait/ptrace: assume __WALL
// if the child is traced") (Linux 4.7), tracees are always eligible for
// waiting, regardless of type.
//
// The child makes itself a tracee with PTRACE_TRACEME and signals the parent
// by closing its end of a pipe; the parent then waits for it with __WCLONE.
TEST(WaitTest, TraceeWALL) {
  int fds[2];
  ASSERT_THAT(pipe(fds), SyscallSucceeds());
  FileDescriptor rfd(fds[0]);
  FileDescriptor wfd(fds[1]);

  pid_t child = fork();
  if (child == 0) {
    // Child.
    rfd.reset();

    TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) == 0);

    // Notify parent that we're now a tracee.
    wfd.reset();

    _exit(0);
  }
  ASSERT_THAT(child, SyscallSucceeds());

  // Drop the parent's copy of the write end so that the read below can see
  // end-of-file once the child has closed its copy.
  wfd.reset();

  // Wait for child to become tracee.
  // Nothing is ever written to the pipe, so the read is expected to return 0.
  char c;
  EXPECT_THAT(ReadFd(rfd.get(), &c, sizeof(c)), SyscallSucceedsWithValue(0));

  // We can wait on the fork child with WCLONE, as it is a tracee.
  int status;
  if (IsRunningOnGvisor()) {
    ASSERT_THAT(Wait4(child, &status, __WCLONE, nullptr),
                SyscallSucceedsWithValue(child));

    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status;
  } else {
    // On older versions of Linux, we may get ECHILD.
    ASSERT_THAT(Wait4(child, &status, __WCLONE, nullptr),
                ::testing::AnyOf(SyscallSucceedsWithValue(child),
                                 SyscallFailsWithErrno(ECHILD)));
  }
}

}  // namespace

}  // namespace testing
}  // namespace gvisor