// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package eventfd implements event fds.
package eventfd

import (
	"math"
	"sync"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fdnotifier"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
// notification (eventfd). Eventfds are usually internal to the Sentry but in
// certain situations they may be converted into a host-backed eventfd.
//
// While hostfd is negative the counter lives in val; once HostFD has created
// a host eventfd, reads, writes and readiness checks are forwarded to it.
//
// +stateify savable
type EventFileDescription struct {
	// vfsfd is the vfs.FileDescription that New initializes with this
	// implementation and returns to the caller.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// queue is used to notify interested parties when the event object
	// becomes readable or writable.
	queue waiter.Queue

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// val is the current value of the event counter.
	val uint64

	// semMode specifies whether the event is in "semaphore" mode. In this
	// mode a read returns 1 and decrements val by one, rather than returning
	// val and resetting it to zero.
	semMode bool

	// hostfd is the host eventfd backing this event, or -1 if no host
	// eventfd has been created (the initial state set by New).
	hostfd int
}

// Compile-time check that *EventFileDescription satisfies
// vfs.FileDescriptionImpl.
var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil)

// New creates a new event fd.
//
// The counter starts at initVal, semMode selects semaphore-style reads, and
// flags are forwarded to the file description's Init. The description is
// attached to a fresh anonymous "[eventfd]" dentry and is initialized with
// positional reads and writes denied. It starts without a host eventfd.
func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
	anon := vfsObj.NewAnonVirtualDentry("[eventfd]")
	// Drop this function's reference on the anonymous dentry on return.
	defer anon.DecRef(ctx)

	efd := new(EventFileDescription)
	efd.val = initVal
	efd.semMode = semMode
	efd.hostfd = -1 // No host eventfd yet.

	opts := vfs.FileDescriptionOptions{
		UseDentryMetadata: true,
		DenyPRead:         true,
		DenyPWrite:        true,
	}
	err := efd.vfsfd.Init(efd, flags, anon.Mount(), anon.Dentry(), &opts)
	if err != nil {
		return nil, err
	}
	return &efd.vfsfd, nil
}

// HostFD returns the host eventfd associated with this event, creating it on
// first use.
//
// When no host eventfd exists yet, a non-blocking one is created (in
// semaphore mode if efd is), seeded with the current counter value, and
// registered with fdnotifier against efd.queue. If registration fails the new
// fd is closed again. On any failure -1 and the error are returned and efd
// keeps using its internal counter.
func (efd *EventFileDescription) HostFD() (int, error) {
	efd.mu.Lock()
	defer efd.mu.Unlock()

	if efd.hostfd < 0 {
		eventFlags := linux.EFD_NONBLOCK
		if efd.semMode {
			eventFlags |= linux.EFD_SEMAPHORE
		}

		// NOTE(review): eventfd2(2) declares initval as unsigned int, so a
		// counter above MaxUint32 would presumably be truncated here —
		// confirm whether callers can reach that state.
		rawFD, _, errno := unix.Syscall(unix.SYS_EVENTFD2, uintptr(efd.val), uintptr(eventFlags), 0)
		if errno != 0 {
			return -1, errno
		}

		if err := fdnotifier.AddFD(int32(rawFD), &efd.queue); err != nil {
			// Don't leak the freshly created fd; a close failure is only
			// logged so that the registration error is what gets returned.
			if closeErr := unix.Close(int(rawFD)); closeErr != nil {
				log.Warningf("close(%d) eventfd failed: %v", rawFD, closeErr)
			}
			return -1, err
		}

		efd.hostfd = int(rawFD)
	}

	return efd.hostfd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
//
// If a host eventfd was created, it is removed from fdnotifier and closed. A
// close failure is logged rather than reported, and hostfd is reset to -1
// either way.
func (efd *EventFileDescription) Release(context.Context) {
	efd.mu.Lock()
	defer efd.mu.Unlock()

	hostFD := efd.hostfd
	if hostFD < 0 {
		// Never converted to a host eventfd; nothing to tear down.
		return
	}

	fdnotifier.RemoveFD(int32(hostFD))
	if err := unix.Close(hostFD); err != nil {
		log.Warningf("close(%d) eventfd failed: %v", hostFD, err)
	}
	efd.hostfd = -1
}

// Read implements vfs.FileDescriptionImpl.Read.
//
// The destination must have room for at least 8 bytes, otherwise EINVAL is
// returned. On success exactly 8 bytes are reported as read.
func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
	// An eventfd counter is always transferred as 8 bytes.
	const counterSize = 8

	if dst.NumBytes() < counterSize {
		return 0, unix.EINVAL
	}
	err := efd.read(ctx, dst)
	if err != nil {
		return 0, err
	}
	return counterSize, nil
}

// Write implements vfs.FileDescriptionImpl.Write.
//
// The source must supply at least 8 bytes, otherwise EINVAL is returned. On
// success exactly 8 bytes are reported as written.
func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
	// An eventfd counter is always transferred as 8 bytes.
	const counterSize = 8

	if src.NumBytes() < counterSize {
		return 0, unix.EINVAL
	}
	err := efd.write(ctx, src)
	if err != nil {
		return 0, err
	}
	return counterSize, nil
}

// hostReadLocked reads 8 bytes from the host eventfd and copies them out to
// dst. A host EWOULDBLOCK is translated to syserror.ErrWouldBlock; any other
// host error is returned unchanged.
//
// Preconditions: Must be called with efd.mu locked.
func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error {
	var raw [8]byte
	_, readErr := unix.Read(efd.hostfd, raw[:])
	switch readErr {
	case nil:
		_, copyErr := dst.CopyOut(ctx, raw[:])
		return copyErr
	case unix.EWOULDBLOCK:
		return syserror.ErrWouldBlock
	default:
		return readErr
	}
}

// read consumes the event counter and copies the consumed value out to dst.
//
// When a host eventfd exists the read is forwarded to it. Otherwise a zero
// counter yields syserror.ErrWouldBlock; in semaphore mode the counter is
// decremented and 1 is reported, and in normal mode the whole counter is
// reported and reset to zero.
func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error {
	efd.mu.Lock()
	switch {
	case efd.hostfd >= 0:
		defer efd.mu.Unlock()
		return efd.hostReadLocked(ctx, dst)
	case efd.val == 0:
		// Nothing to consume yet.
		efd.mu.Unlock()
		return syserror.ErrWouldBlock
	}

	// Consume from the counter according to the mode. Consistent with Linux,
	// the counter is updated even if the copy to memory below fails.
	var consumed uint64
	if !efd.semMode {
		consumed, efd.val = efd.val, 0
	} else {
		consumed = 1
		efd.val--
	}
	efd.mu.Unlock()

	// Wake writers unconditionally: even if the event was already writable,
	// a writer may be waiting to write the maximum value.
	efd.queue.Notify(waiter.WritableEvents)

	var raw [8]byte
	hostarch.ByteOrder.PutUint64(raw[:], consumed)
	_, err := dst.CopyOut(ctx, raw[:])
	return err
}

// hostWriteLocked encodes val as 8 bytes and writes it to the host eventfd.
// A host EWOULDBLOCK is translated to syserror.ErrWouldBlock; any other host
// error is returned unchanged.
//
// Preconditions: Must be called with efd.mu locked.
func (efd *EventFileDescription) hostWriteLocked(val uint64) error {
	var raw [8]byte
	hostarch.ByteOrder.PutUint64(raw[:], val)
	if _, err := unix.Write(efd.hostfd, raw[:]); err != nil {
		if err == unix.EWOULDBLOCK {
			return syserror.ErrWouldBlock
		}
		return err
	}
	return nil
}

// write copies an 8-byte value in from src, decodes it with
// hostarch.ByteOrder and signals the event with it.
func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error {
	var raw [8]byte
	_, err := src.CopyIn(ctx, raw[:])
	if err != nil {
		return err
	}
	return efd.Signal(hostarch.ByteOrder.Uint64(raw[:]))
}

// Signal is an internal function to signal the event fd, adding val to the
// counter.
//
// A val of math.MaxUint64 is rejected with EINVAL. When a host eventfd exists
// the value is written to it instead. Otherwise, if the addition would push
// the counter past math.MaxUint64-1, syserror.ErrWouldBlock is returned and
// the counter is left untouched.
func (efd *EventFileDescription) Signal(val uint64) error {
	if val == math.MaxUint64 {
		return unix.EINVAL
	}

	efd.mu.Lock()

	if efd.hostfd >= 0 {
		defer efd.mu.Unlock()
		return efd.hostWriteLocked(val)
	}

	// headroom is how much can still be added before the counter would
	// exceed the largest permitted value, max uint64 minus 1.
	if headroom := math.MaxUint64 - 1 - efd.val; val > headroom {
		efd.mu.Unlock()
		return syserror.ErrWouldBlock
	}

	efd.val += val
	efd.mu.Unlock()

	// Readers are always notified, and only after the lock is dropped.
	efd.queue.Notify(waiter.ReadableEvents)

	return nil
}

// Readiness implements waiter.Waitable.Readiness.
//
// With a host eventfd, the answer comes from a non-blocking poll of that fd.
// Otherwise the event is readable when the counter is non-zero and writable
// when the counter is below math.MaxUint64-1.
func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
	efd.mu.Lock()
	defer efd.mu.Unlock()

	if efd.hostfd >= 0 {
		return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask)
	}

	var ready waiter.EventMask
	if efd.val != 0 {
		ready |= waiter.ReadableEvents
	}
	if efd.val < math.MaxUint64-1 {
		ready |= waiter.WritableEvents
	}

	// Report only the events the caller asked about.
	return ready & mask
}

// EventRegister implements waiter.Waitable.EventRegister.
//
// The entry is added to efd.queue; if a host eventfd exists, fdnotifier is
// then asked to update its state for that fd.
func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) {
	efd.queue.EventRegister(entry, mask)

	efd.mu.Lock()
	defer efd.mu.Unlock()
	if efd.hostfd < 0 {
		return
	}
	// Presumably lets the notifier pick up the changed set of waiters.
	fdnotifier.UpdateFD(int32(efd.hostfd))
}

// EventUnregister implements waiter.Waitable.EventUnregister.
//
// The entry is removed from efd.queue; if a host eventfd exists, fdnotifier
// is then asked to update its state for that fd.
func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) {
	efd.queue.EventUnregister(entry)

	efd.mu.Lock()
	defer efd.mu.Unlock()
	if efd.hostfd < 0 {
		return
	}
	// Presumably lets the notifier pick up the changed set of waiters.
	fdnotifier.UpdateFD(int32(efd.hostfd))
}