From d0b1d0233dc8a8ac837d534cd0664eabb9dd0a71 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 7 May 2020 12:42:46 -0700 Subject: Move pkg/sentry/vfs/{eventfd,timerfd} to new packages in pkg/sentry/fsimpl. They don't depend on anything in VFS2, so they should be their own packages. PiperOrigin-RevId: 310416807 --- pkg/sentry/fsimpl/eventfd/BUILD | 33 ++++ pkg/sentry/fsimpl/eventfd/eventfd.go | 284 ++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/eventfd/eventfd_test.go | 97 ++++++++++ pkg/sentry/fsimpl/timerfd/BUILD | 17 ++ pkg/sentry/fsimpl/timerfd/timerfd.go | 143 +++++++++++++++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 11 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/eventfd.go | 4 +- pkg/sentry/syscalls/linux/vfs2/timerfd.go | 19 +- pkg/sentry/vfs/BUILD | 4 - pkg/sentry/vfs/eventfd.go | 282 ----------------------------- pkg/sentry/vfs/eventfd_test.go | 96 ---------- pkg/sentry/vfs/timerfd.go | 141 --------------- 14 files changed, 596 insertions(+), 538 deletions(-) create mode 100644 pkg/sentry/fsimpl/eventfd/BUILD create mode 100644 pkg/sentry/fsimpl/eventfd/eventfd.go create mode 100644 pkg/sentry/fsimpl/eventfd/eventfd_test.go create mode 100644 pkg/sentry/fsimpl/timerfd/BUILD create mode 100644 pkg/sentry/fsimpl/timerfd/timerfd.go delete mode 100644 pkg/sentry/vfs/eventfd.go delete mode 100644 pkg/sentry/vfs/eventfd_test.go delete mode 100644 pkg/sentry/vfs/timerfd.go (limited to 'pkg') diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD new file mode 100644 index 000000000..ea167d38c --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "eventfd", + srcs = ["eventfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/log", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", 
+ "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + library = ":eventfd", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go new file mode 100644 index 000000000..c573d7935 --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -0,0 +1,284 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd implements event fds. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventFileDescription implements FileDescriptionImpl for file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in +// certain situations they may be converted into a host-backed eventfd. +type EventFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + // queue is used to notify interested parties when the event object + // becomes readable or writable. 
+ queue waiter.Queue `state:"zerovalue"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd is the host eventfd backing this event, or -1 if there is none. + hostfd int +} + +var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) + +// New creates a new event fd. +func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[eventfd]") + defer vd.DecRef() + efd := &EventFileDescription{ + val: initVal, + semMode: semMode, + hostfd: -1, + } + if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &efd.vfsfd, nil +} + +// HostFD returns the host eventfd associated with this event. 
+func (efd *EventFileDescription) HostFD() (int, error) { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + return efd.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if efd.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) + if errno != 0 { + return -1, errno + } + + if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { + if closeErr := syscall.Close(int(fd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) + } + return -1, err + } + + efd.hostfd = int(fd) + return efd.hostfd, nil +} + +// Release implements FileDescriptionImpl.Release() +func (efd *EventFileDescription) Release() { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.RemoveFD(int32(efd.hostfd)) + if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) + } + efd.hostfd = -1 + } +} + +// Read implements FileDescriptionImpl.Read. +func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements FileDescriptionImpl.Write. +func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Preconditions: Must be called with efd.mu locked. 
+func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { + efd.mu.Lock() + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostReadLocked(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if efd.val == 0 { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if efd.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + efd.val-- + } else { + val = efd.val + efd.val = 0 + } + + efd.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + efd.queue.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostWriteLocked(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(efd.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return efd.Signal(val) +} + +// Signal is an internal function to signal the event fd. 
+func (efd *EventFileDescription) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + efd.mu.Lock() + + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostWriteLocked(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-efd.val { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + efd.val += val + efd.mu.Unlock() + + // Always trigger a notification. + efd.queue.Notify(waiter.EventIn) + + return nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + efd.mu.Lock() + defer efd.mu.Unlock() + + if efd.hostfd >= 0 { + return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) + } + + ready := waiter.EventMask(0) + if efd.val > 0 { + ready |= waiter.EventIn + } + + if efd.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + efd.queue.EventRegister(entry, mask) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { + efd.queue.EventUnregister(entry) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go new file mode 100644 index 000000000..20e3adffc --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -0,0 +1,97 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestEventFD(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + eventfd.EventRegister(&w, waiter.EventIn) + defer eventfd.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. 
+ select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventFDStat(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + statx, err := eventfd.Stat(ctx, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + t.Fatalf("eventfd.Stat failed: %v", err) + } + if statx.Size != 0 { + t.Errorf("eventfd size should be 0") + } +} diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD new file mode 100644 index 000000000..fbb02a271 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "timerfd", + srcs = ["timerfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go new file mode 100644 index 000000000..60c92d626 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package timerfd implements timer fds. +package timerfd + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// implements ktime.TimerListener. +type TimerFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + events waiter.Queue + timer *ktime.Timer + + // val is the number of timer expirations since the last successful + // call to Read or SetTime. val must be accessed using atomic memory + // operations. + val uint64 +} + +var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) +var _ ktime.TimerListener = (*TimerFileDescription)(nil) + +// New returns a new timer fd. +func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[timerfd]") + defer vd.DecRef() + tfd := &TimerFileDescription{} + tfd.timer = ktime.NewTimer(clock, tfd) + if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &tfd.vfsfd, nil +} + +// Read implements FileDescriptionImpl.Read. 
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of + // expirations even if writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Clock returns the timer fd's Clock. +func (tfd *TimerFileDescription) Clock() ktime.Clock { + return tfd.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. +func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { + return tfd.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. +func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&tfd.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + tfd.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { + tfd.events.EventUnregister(e) +} + +// PauseTimer pauses the associated Timer. 
+func (tfd *TimerFileDescription) PauseTimer() { + tfd.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (tfd *TimerFileDescription) ResumeTimer() { + tfd.timer.Resume() +} + +// Release implements FileDescriptionImpl.Release() +func (tfd *TimerFileDescription) Release() { + tfd.timer.Destroy() +} + +// Notify implements ktime.TimerListener.Notify. +func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + atomic.AddUint64(&tfd.val, exp) + tfd.events.Notify(waiter.EventIn) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (tfd *TimerFileDescription) Destroy() {} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e47af66d6..8104f50f3 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -172,6 +172,7 @@ go_library( "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c91b9dce2..271ea5faf 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -48,10 +48,11 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -1068,11 +1069,11 @@ func (k *Kernel) pauseTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); 
ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.PauseTimer() } } @@ -1104,11 +1105,11 @@ func (k *Kernel) resumeTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.ResumeTimer() } } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 14838aa2c..c32f942fb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -39,8 +39,10 @@ go_library( "//pkg/gohacks", "//pkg/sentry/arch", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/eventfd", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index bd2194972..aff1a2070 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,12 +32,13 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EINVAL } + vfsObj := t.Kernel().VFS() fileFlags := uint32(linux.O_RDWR) if flags&linux.EFD_NONBLOCK != 0 { fileFlags |= linux.O_NONBLOCK } 
semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := t.Kernel().VFS().NewEventFD(initVal, semMode, fileFlags) + eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 839a07db1..5ac79bc09 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -17,9 +17,9 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,9 +32,12 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } - var fileFlags uint32 + // Timerfds aren't writable per se (their implementation of Write just + // returns EINVAL), but they are "opened for writing", which is necessary + // to actually reach said implementation of Write. + fileFlags := uint32(linux.O_RDWR) if flags&linux.TFD_NONBLOCK != 0 { - fileFlags = linux.O_NONBLOCK + fileFlags |= linux.O_NONBLOCK } var clock ktime.Clock @@ -46,10 +49,8 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel default: return 0, nil, syserror.EINVAL } - // Timerfds aren't writable per se (their implementation of Write just - // returns EINVAL), but they are "opened for writing", which is necessary - // to actually reach said implementation of Write. 
- file, err := t.Kernel().VFS().NewTimerFD(clock, linux.O_RDWR|fileFlags) + vfsObj := t.Kernel().VFS() + file, err := timerfd.New(vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } @@ -80,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } @@ -114,7 +115,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 86046dd99..94d69c1cc 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -25,7 +25,6 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", - "eventfd.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", @@ -37,7 +36,6 @@ go_library( "pathname.go", "permissions.go", "resolving_path.go", - "timerfd.go", "vfs.go", ], visibility = ["//pkg/sentry:internal"], @@ -71,7 +69,6 @@ go_test( name = "vfs_test", size = "small", srcs = [ - "eventfd_test.go", "file_description_impl_util_test.go", "mount_test.go", ], @@ -83,6 +80,5 @@ go_test( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", - "//pkg/waiter", ], ) diff --git a/pkg/sentry/vfs/eventfd.go b/pkg/sentry/vfs/eventfd.go deleted file mode 100644 index f39dacacf..000000000 --- a/pkg/sentry/vfs/eventfd.go +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "math" - "sync" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// EventFileDescription implements FileDescriptionImpl for file-based event -// notification (eventfd). Eventfds are usually internal to the Sentry but in -// certain situations they may be converted into a host-backed eventfd. -type EventFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl - - // queue is used to notify interested parties when the event object - // becomes readable or writable. - queue waiter.Queue `state:"zerovalue"` - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // val is the current value of the event counter. - val uint64 - - // semMode specifies whether the event is in "semaphore" mode. - semMode bool - - // hostfd indicates whether this eventfd is passed through to the host. - hostfd int -} - -var _ FileDescriptionImpl = (*EventFileDescription)(nil) - -// NewEventFD creates a new event fd. 
-func (vfs *VirtualFilesystem) NewEventFD(initVal uint64, semMode bool, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[eventfd]") - defer vd.DecRef() - efd := &EventFileDescription{ - val: initVal, - semMode: semMode, - hostfd: -1, - } - if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ - UseDentryMetadata: true, - DenyPRead: true, - DenyPWrite: true, - }); err != nil { - return nil, err - } - return &efd.vfsfd, nil -} - -// HostFD returns the host eventfd associated with this event. -func (efd *EventFileDescription) HostFD() (int, error) { - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - return efd.hostfd, nil - } - - flags := linux.EFD_NONBLOCK - if efd.semMode { - flags |= linux.EFD_SEMAPHORE - } - - fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) - if errno != 0 { - return -1, errno - } - - if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { - if closeErr := syscall.Close(int(fd)); closeErr != nil { - log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) - } - return -1, err - } - - efd.hostfd = int(fd) - return efd.hostfd, nil -} - -// Release implements FileDescriptionImpl.Release() -func (efd *EventFileDescription) Release() { - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.RemoveFD(int32(efd.hostfd)) - if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { - log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) - } - efd.hostfd = -1 - } -} - -// Read implements FileDescriptionImpl.Read. -func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ ReadOptions) (int64, error) { - if dst.NumBytes() < 8 { - return 0, syscall.EINVAL - } - if err := efd.read(ctx, dst); err != nil { - return 0, err - } - return 8, nil -} - -// Write implements FileDescriptionImpl.Write. 
-func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ WriteOptions) (int64, error) { - if src.NumBytes() < 8 { - return 0, syscall.EINVAL - } - if err := efd.write(ctx, src); err != nil { - return 0, err - } - return 8, nil -} - -// Preconditions: Must be called with efd.mu locked. -func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { - var buf [8]byte - if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { - if err == syscall.EWOULDBLOCK { - return syserror.ErrWouldBlock - } - return err - } - _, err := dst.CopyOut(ctx, buf[:]) - return err -} - -func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { - efd.mu.Lock() - if efd.hostfd >= 0 { - defer efd.mu.Unlock() - return efd.hostReadLocked(ctx, dst) - } - - // We can't complete the read if the value is currently zero. - if efd.val == 0 { - efd.mu.Unlock() - return syserror.ErrWouldBlock - } - - // Update the value based on the mode the event is operating in. - var val uint64 - if efd.semMode { - val = 1 - // Consistent with Linux, this is done even if writing to memory fails. - efd.val-- - } else { - val = efd.val - efd.val = 0 - } - - efd.mu.Unlock() - - // Notify writers. We do this even if we were already writable because - // it is possible that a writer is waiting to write the maximum value - // to the event. - efd.queue.Notify(waiter.EventOut) - - var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) - _, err := dst.CopyOut(ctx, buf[:]) - return err -} - -// Preconditions: Must be called with efd.mu locked. 
-func (efd *EventFileDescription) hostWriteLocked(val uint64) error { - var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) - _, err := syscall.Write(efd.hostfd, buf[:]) - if err == syscall.EWOULDBLOCK { - return syserror.ErrWouldBlock - } - return err -} - -func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { - var buf [8]byte - if _, err := src.CopyIn(ctx, buf[:]); err != nil { - return err - } - val := usermem.ByteOrder.Uint64(buf[:]) - - return efd.Signal(val) -} - -// Signal is an internal function to signal the event fd. -func (efd *EventFileDescription) Signal(val uint64) error { - if val == math.MaxUint64 { - return syscall.EINVAL - } - - efd.mu.Lock() - - if efd.hostfd >= 0 { - defer efd.mu.Unlock() - return efd.hostWriteLocked(val) - } - - // We only allow writes that won't cause the value to go over the max - // uint64 minus 1. - if val > math.MaxUint64-1-efd.val { - efd.mu.Unlock() - return syserror.ErrWouldBlock - } - - efd.val += val - efd.mu.Unlock() - - // Always trigger a notification. - efd.queue.Notify(waiter.EventIn) - - return nil -} - -// Readiness implements waiter.Waitable.Readiness. -func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - efd.mu.Lock() - defer efd.mu.Unlock() - - if efd.hostfd >= 0 { - return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) - } - - ready := waiter.EventMask(0) - if efd.val > 0 { - ready |= waiter.EventIn - } - - if efd.val < math.MaxUint64-1 { - ready |= waiter.EventOut - } - - return mask & ready -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { - efd.queue.EventRegister(entry, mask) - - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.UpdateFD(int32(efd.hostfd)) - } -} - -// EventUnregister implements waiter.Waitable.EventUnregister. 
-func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { - efd.queue.EventUnregister(entry) - - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.UpdateFD(int32(efd.hostfd)) - } -} diff --git a/pkg/sentry/vfs/eventfd_test.go b/pkg/sentry/vfs/eventfd_test.go deleted file mode 100644 index 2dff2d10b..000000000 --- a/pkg/sentry/vfs/eventfd_test.go +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -func TestEventFD(t *testing.T) { - initVals := []uint64{ - 0, - // Using a non-zero initial value verifies that writing to an - // eventfd signals when the eventfd's counter was already - // non-zero. - 343, - } - - for _, initVal := range initVals { - ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - t.Fatalf("VFS init: %v", err) - } - - // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(initVal, false, linux.O_RDWR) - if err != nil { - t.Fatalf("NewEventFD failed: %v", err) - } - defer eventfd.DecRef() - - // Register a callback for a write event. 
- w, ch := waiter.NewChannelEntry(nil) - eventfd.EventRegister(&w, waiter.EventIn) - defer eventfd.EventUnregister(&w) - - data := []byte("00000124") - // Create and submit a write request. - n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), WriteOptions{}) - if err != nil { - t.Fatal(err) - } - if n != 8 { - t.Errorf("eventfd.write wrote %d bytes, not full int64", n) - } - - // Check if the callback fired due to the write event. - select { - case <-ch: - default: - t.Errorf("Didn't get notified of EventIn after write") - } - } -} - -func TestEventFDStat(t *testing.T) { - ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - t.Fatalf("VFS init: %v", err) - } - - // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(0, false, linux.O_RDWR) - if err != nil { - t.Fatalf("NewEventFD failed: %v", err) - } - defer eventfd.DecRef() - - statx, err := eventfd.Stat(ctx, StatOptions{ - Mask: linux.STATX_BASIC_STATS, - }) - if err != nil { - t.Fatalf("eventfd.Stat failed: %v", err) - } - if statx.Size != 0 { - t.Errorf("eventfd size should be 0") - } -} diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go deleted file mode 100644 index cc536ceaf..000000000 --- a/pkg/sentry/vfs/timerfd.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package vfs - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// TimerFileDescription implements FileDescriptionImpl for timer fds. It also -// implements ktime.TimerListener. -type TimerFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl - - events waiter.Queue - timer *ktime.Timer - - // val is the number of timer expirations since the last successful - // call to PRead, or SetTime. val must be accessed using atomic memory - // operations. - val uint64 -} - -var _ FileDescriptionImpl = (*TimerFileDescription)(nil) -var _ ktime.TimerListener = (*TimerFileDescription)(nil) - -// NewTimerFD returns a new timer fd. -func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[timerfd]") - defer vd.DecRef() - tfd := &TimerFileDescription{} - tfd.timer = ktime.NewTimer(clock, tfd) - if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ - UseDentryMetadata: true, - DenyPRead: true, - DenyPWrite: true, - }); err != nil { - return nil, err - } - return &tfd.vfsfd, nil -} - -// Read implements FileDescriptionImpl.Read. -func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - const sizeofUint64 = 8 - if dst.NumBytes() < sizeofUint64 { - return 0, syserror.EINVAL - } - if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { - var buf [sizeofUint64]byte - usermem.ByteOrder.PutUint64(buf[:], val) - if _, err := dst.CopyOut(ctx, buf[:]); err != nil { - // Linux does not undo consuming the number of - // expirations even if writing to userspace fails. 
- return 0, err - } - return sizeofUint64, nil - } - return 0, syserror.ErrWouldBlock -} - -// Clock returns the timer fd's Clock. -func (tfd *TimerFileDescription) Clock() ktime.Clock { - return tfd.timer.Clock() -} - -// GetTime returns the associated Timer's setting and the time at which it was -// observed. -func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { - return tfd.timer.Get() -} - -// SetTime atomically changes the associated Timer's setting, resets the number -// of expirations to 0, and returns the previous setting and the time at which -// it was observed. -func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { - return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) -} - -// Readiness implements waiter.Waitable.Readiness. -func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - var ready waiter.EventMask - if atomic.LoadUint64(&tfd.val) != 0 { - ready |= waiter.EventIn - } - return ready -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - tfd.events.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { - tfd.events.EventUnregister(e) -} - -// PauseTimer pauses the associated Timer. -func (tfd *TimerFileDescription) PauseTimer() { - tfd.timer.Pause() -} - -// ResumeTimer resumes the associated Timer. -func (tfd *TimerFileDescription) ResumeTimer() { - tfd.timer.Resume() -} - -// Release implements FileDescriptionImpl.Release() -func (tfd *TimerFileDescription) Release() { - tfd.timer.Destroy() -} - -// Notify implements ktime.TimerListener.Notify. 
-func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { - atomic.AddUint64(&tfd.val, exp) - tfd.events.Notify(waiter.EventIn) - return ktime.Setting{}, false -} - -// Destroy implements ktime.TimerListener.Destroy. -func (tfd *TimerFileDescription) Destroy() {} -- cgit v1.2.3