diff options
author | Jing Chen <chjing@google.com> | 2021-04-29 15:19:22 -0700 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-04-29 15:22:09 -0700 |
commit | eefa00f4aecc7c72a1866357fbd1bbb58aa6fc5e (patch) | |
tree | 53bbc21d6863aa69230cd545e7ffe396687207a2 | |
parent | 669523f7d225006d1fbc7c6a2b347736019908e2 (diff) |
Implement epoll_pwait2.
PiperOrigin-RevId: 371216407
-rw-r--r-- | pkg/sentry/strace/linux64_amd64.go | 1 | ||||
-rw-r--r-- | pkg/sentry/strace/linux64_arm64.go | 1 | ||||
-rw-r--r-- | pkg/sentry/syscalls/epoll.go | 8 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux64.go | 2 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_epoll.go | 56 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/epoll.go | 52 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 | ||||
-rw-r--r-- | test/syscalls/linux/epoll.cc | 53 | ||||
-rw-r--r-- | test/util/test_util.h | 9 |
9 files changed, 160 insertions, 24 deletions
diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go index 71b92eaee..d66befe81 100644 --- a/pkg/sentry/strace/linux64_amd64.go +++ b/pkg/sentry/strace/linux64_amd64.go @@ -371,6 +371,7 @@ var linuxAMD64 = SyscallMap{ 433: makeSyscallInfo("fspick", FD, Path, Hex), 434: makeSyscallInfo("pidfd_open", Hex, Hex), 435: makeSyscallInfo("clone3", Hex, Hex), + 441: makeSyscallInfo("epoll_pwait2", FD, EpollEvents, Hex, Timespec, SigSet), } func init() { diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go index bd7361a52..1a2d7d75f 100644 --- a/pkg/sentry/strace/linux64_arm64.go +++ b/pkg/sentry/strace/linux64_arm64.go @@ -312,6 +312,7 @@ var linuxARM64 = SyscallMap{ 433: makeSyscallInfo("fspick", FD, Path, Hex), 434: makeSyscallInfo("pidfd_open", Hex, Hex), 435: makeSyscallInfo("clone3", Hex, Hex), + 441: makeSyscallInfo("epoll_pwait2", FD, EpollEvents, Hex, Timespec, SigSet), } func init() { diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index e115683f8..3b4d79889 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -119,7 +119,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { } // WaitEpoll implements the epoll_wait(2) linux syscall. -func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) { +func WaitEpoll(t *kernel.Task, fd int32, max int, timeoutInNanos int64) ([]linux.EpollEvent, error) { // Get epoll from the file descriptor. epollfile := t.GetFile(fd) if epollfile == nil { @@ -136,7 +136,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve // Try to read events and return right away if we got them or if the // caller requested a non-blocking "wait". r := e.ReadEvents(max) - if len(r) != 0 || timeout == 0 { + if len(r) != 0 || timeoutInNanos == 0 { return r, nil } @@ -144,8 +144,8 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve // and register with the epoll object for readability events. var haveDeadline bool var deadline ktime.Time - if timeout > 0 { - timeoutDur := time.Duration(timeout) * time.Millisecond + if timeoutInNanos > 0 { + timeoutDur := time.Duration(timeoutInNanos) * time.Nanosecond deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) haveDeadline = true } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 2d2212605..090c5ffcb 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -404,6 +404,7 @@ var AMD64 = &kernel.SyscallTable{ 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{ 0xffffffffff600000: 96, // vsyscall gettimeofday(2) @@ -722,6 +723,7 @@ var ARM64 = &kernel.SyscallTable{ 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{}, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 7f460d30b..69cbc98d0 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" @@ -104,14 +105,8 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } } -// EpollWait implements the epoll_wait(2) linux syscall. -func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := args[0].Int() - eventsAddr := args[1].Pointer() - maxEvents := int(args[2].Int()) - timeout := int(args[3].Int()) - - r, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout) +func waitEpoll(t *kernel.Task, fd int32, eventsAddr hostarch.Addr, max int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { + r, err := syscalls.WaitEpoll(t, fd, max, timeoutInNanos) if err != nil { return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) } @@ -123,6 +118,17 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } return uintptr(len(r)), nil, nil + +} + +// EpollWait implements the epoll_wait(2) linux syscall. +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + // Convert milliseconds to nanoseconds. + timeoutInNanos := int64(args[3].Int()) * 1000000 + return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) } // EpollPwait implements the epoll_pwait(2) linux syscall. @@ -144,4 +150,38 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return EpollWait(t, args) } +// EpollPwait2 implements the epoll_pwait(2) linux syscall. +func EpollPwait2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeoutPtr := args[3].Pointer() + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + haveTimeout := timeoutPtr != 0 + + var timeoutInNanos int64 = -1 + if haveTimeout { + timeout, err := copyTimespecIn(t, timeoutPtr) + if err != nil { + return 0, nil, err + } + timeoutInNanos = timeout.ToNsec() + + } + + if maskAddr != 0 { + mask, err := CopyInSigSet(t, maskAddr, maskSize) + if err != nil { + return 0, nil, err + } + + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + + return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) +} + // LINT.ThenChange(vfs2/epoll.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index b980aa43e..047d955b6 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -19,6 +19,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -118,13 +119,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } } -// EpollWait implements Linux syscall epoll_wait(2). -func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - epfd := args[0].Int() - eventsAddr := args[1].Pointer() - maxEvents := int(args[2].Int()) - timeout := int(args[3].Int()) - +func waitEpoll(t *kernel.Task, epfd int32, eventsAddr hostarch.Addr, maxEvents int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { return 0, nil, syserror.EINVAL @@ -158,7 +153,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } return 0, nil, err } - if timeout == 0 { + if timeoutInNanos == 0 { return 0, nil, nil } // In the first iteration of this loop, register with the epoll @@ -173,8 +168,8 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys defer epfile.EventUnregister(&w) } else { // Set up the timer if a timeout was specified. - if timeout > 0 && !haveDeadline { - timeoutDur := time.Duration(timeout) * time.Millisecond + if timeoutInNanos > 0 && !haveDeadline { + timeoutDur := time.Duration(timeoutInNanos) * time.Nanosecond deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) haveDeadline = true } @@ -186,6 +181,17 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } } } + +} + +// EpollWait implements Linux syscall epoll_wait(2). +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeoutInNanos := int64(args[3].Int()) * 1000000 + + return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) } // EpollPwait implements Linux syscall epoll_pwait(2). @@ -199,3 +205,29 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return EpollWait(t, args) } + +// EpollPwait2 implements Linux syscall epoll_pwait(2). +func EpollPwait2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeoutPtr := args[3].Pointer() + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + haveTimeout := timeoutPtr != 0 + + var timeoutInNanos int64 = -1 + if haveTimeout { + var timeout linux.Timespec + if _, err := timeout.CopyIn(t, timeoutPtr); err != nil { + return 0, nil, err + } + timeoutInNanos = timeout.ToNsec() + } + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index c50fd97eb..0fc81e694 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -159,6 +159,7 @@ func Override() { s.Table[327] = syscalls.Supported("preadv2", Preadv2) s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) s.Table[332] = syscalls.Supported("statx", Statx) + s.Table[441] = syscalls.Supported("epoll_pwait2", EpollPwait2) s.Init() // Override ARM64. @@ -269,6 +270,7 @@ func Override() { s.Table[286] = syscalls.Supported("preadv2", Preadv2) s.Table[287] = syscalls.Supported("pwritev2", Pwritev2) s.Table[291] = syscalls.Supported("statx", Statx) + s.Table[441] = syscalls.Supported("epoll_pwait2", EpollPwait2) s.Init() } diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index b180f633c..af3d27894 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -39,6 +39,15 @@ namespace { constexpr int kFDsPerEpoll = 3; constexpr uint64_t kMagicConstant = 0x0102030405060708; +#ifndef SYS_epoll_pwait2 +#define SYS_epoll_pwait2 441 +#endif + +int epoll_pwait2(int fd, struct epoll_event* events, int maxevents, + const struct timespec* timeout, const sigset_t* sigset) { + return syscall(SYS_epoll_pwait2, fd, events, maxevents, timeout, sigset); +} + TEST(EpollTest, AllWritable) { auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); std::vector<FileDescriptor> eventfds; @@ -144,6 +153,50 @@ TEST(EpollTest, Timeout) { EXPECT_GT(ms_elapsed(begin, end), kTimeoutMs - 1); } +TEST(EpollTest, EpollPwait2Timeout) { + auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + // 200 milliseconds. + constexpr int kTimeoutNs = 200000000; + struct timespec timeout; + timeout.tv_sec = 0; + timeout.tv_nsec = 0; + struct timespec begin; + struct timespec end; + struct epoll_event result[kFDsPerEpoll]; + + std::vector<FileDescriptor> eventfds; + for (int i = 0; i < kFDsPerEpoll; i++) { + eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD())); + ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, + kMagicConstant + i)); + } + + // Pass valid arguments so that the syscall won't be blocked indefinitely + // nor return errno EINVAL. + // + // The syscall returns immediately when timeout is zero, + // even if no events are available. + SKIP_IF(!IsRunningOnGvisor() && + epoll_pwait2(epollfd.get(), result, kFDsPerEpoll, &timeout, nullptr) < + 0 && + errno == ENOSYS); + + { + const DisableSave ds; // Timing-related. + EXPECT_THAT(clock_gettime(CLOCK_MONOTONIC, &begin), SyscallSucceeds()); + + timeout.tv_nsec = kTimeoutNs; + ASSERT_THAT(RetryEINTR(epoll_pwait2)(epollfd.get(), result, kFDsPerEpoll, + &timeout, nullptr), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(clock_gettime(CLOCK_MONOTONIC, &end), SyscallSucceeds()); + } + + // Check the lower bound on the timeout. Checking for an upper bound is + // fragile because Linux can overrun the timeout due to scheduling delays. + EXPECT_GT(ns_elapsed(begin, end), kTimeoutNs - 1); +} + void* writer(void* arg) { int fd = *reinterpret_cast<int*>(arg); uint64_t tmp = 1; diff --git a/test/util/test_util.h b/test/util/test_util.h index 876ff58db..bcbb388ed 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -272,10 +272,15 @@ PosixErrorOr<std::vector<OpenFd>> GetOpenFDs(); // Returns the number of hard links to a path. PosixErrorOr<uint64_t> Links(const std::string& path); +inline uint64_t ns_elapsed(const struct timespec& begin, + const struct timespec& end) { + return (end.tv_sec - begin.tv_sec) * 1000000000 + + (end.tv_nsec - begin.tv_nsec); +} + inline uint64_t ms_elapsed(const struct timespec& begin, const struct timespec& end) { - return (end.tv_sec - begin.tv_sec) * 1000 + - (end.tv_nsec - begin.tv_nsec) / 1000000; + return ns_elapsed(begin, end) / 1000000; } namespace internal { |