// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fdnotifier"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/lisafs"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/p9"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
// in effect) regular files. specialFileFD differs from regularFileFD by using
// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
//
// When the handle has a host FD (handle.fd >= 0), specialFileFD may also be
// used as a memmap.Mappable and memmap.File; see ConfigureMMap.
//
// +stateify savable
type specialFileFD struct {
	// fileDescription is the embedded common FD state. Methods that are not
	// specialized for this FD (e.g. Readiness when haveQueue is false) are
	// forwarded to it.
	fileDescription

	// releaseMu synchronizes the closing of fd.handle with fd.sync(). It's safe
	// to access fd.handle without locking for operations that require a ref to
	// be held by the caller, e.g. vfs.FileDescriptionImpl implementations.
	releaseMu sync.RWMutex `state:"nosave"`

	// handle is used for file I/O. handle is immutable.
	handle handle `state:"nosave"`

	// isRegularFile is true if this FD represents a regular file which is only
	// possible when filesystemOptions.regularFilesUseSpecialFileFD is in
	// effect. isRegularFile is immutable.
	isRegularFile bool

	// seekable is true if this file description represents a file for which
	// file offset is significant, i.e. a regular file, character device or
	// block device. seekable is immutable.
	seekable bool

	// haveQueue is true if this file description represents a file for which
	// queue may send I/O readiness events, i.e. a FIFO or socket whose handle
	// has a host FD. haveQueue is immutable.
	//
	// If haveQueue is true, handle.fd is registered with fdnotifier using
	// queue, and waiters are registered on queue rather than on the embedded
	// fileDescription.
	haveQueue bool `state:"nosave"`
	queue     waiter.Queue

	// If seekable is true, off is the file offset. off is protected by mu.
	mu  sync.Mutex `state:"nosave"`
	off int64

	// If haveBuf is non-zero, this FD represents a pipe, and buf contains data
	// read from the pipe from previous calls to specialFileFD.savePipeData().
	// haveBuf and buf are protected by bufMu. haveBuf is accessed using atomic
	// memory operations, which lets readers skip taking bufMu when no data is
	// buffered.
	bufMu   sync.Mutex `state:"nosave"`
	haveBuf uint32
	buf     []byte

	// If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and
	// hostFileMapperInitOnce is used to initialize it on first use.
	hostFileMapperInitOnce sync.Once `state:"nosave"`
	hostFileMapper         fsutil.HostFileMapper

	// If handle.fd >= 0, fileRefs counts references on memmap.File offsets.
	// fileRefs is protected by fileRefsMu.
	fileRefsMu sync.Mutex `state:"nosave"`
	fileRefs   fsutil.FrameRefSet
}

// newSpecialFileFD creates a specialFileFD for dentry d that performs I/O
// through handle h, and initializes its vfs.FileDescription on mnt with the
// given open flags.
//
// For FIFOs and sockets whose handle has a host FD, the host FD is registered
// with fdnotifier so that readiness events are delivered through the FD's
// queue; that registration is undone if vfs.FileDescription initialization
// fails. On success the new FD is recorded in d.fs.specialFileFDs and the
// relevant open metrics are incremented.
func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) {
	// Classify the file once; all of these properties are immutable for the
	// lifetime of the FD.
	var isRegular, offsetMatters, notifiable bool
	switch d.fileType() {
	case linux.S_IFREG:
		isRegular = true
		offsetMatters = true
	case linux.S_IFCHR, linux.S_IFBLK:
		offsetMatters = true
	case linux.S_IFIFO, linux.S_IFSOCK:
		// Readiness notifications are only available through a host FD.
		notifiable = h.fd >= 0
	}

	sfd := &specialFileFD{
		handle:        h,
		isRegularFile: isRegular,
		seekable:      offsetMatters,
		haveQueue:     notifiable,
	}
	sfd.LockFD.Init(&d.locks)

	if notifiable {
		if err := fdnotifier.AddFD(h.fd, &sfd.queue); err != nil {
			return nil, err
		}
	}

	// Positional I/O is meaningless when the file offset is not significant.
	fdOpts := vfs.FileDescriptionOptions{
		DenyPRead:  !offsetMatters,
		DenyPWrite: !offsetMatters,
	}
	if err := sfd.vfsfd.Init(sfd, flags, mnt, &d.vfsd, &fdOpts); err != nil {
		if notifiable {
			fdnotifier.RemoveFD(h.fd)
		}
		return nil, err
	}

	// Make the FD visible to filesystem-wide sync.
	d.fs.syncMu.Lock()
	d.fs.specialFileFDs[sfd] = struct{}{}
	d.fs.syncMu.Unlock()

	if sfd.vfsfd.IsWritable() {
		if mode := atomic.LoadUint32(&d.mode); mode&0111 != 0 {
			metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file")
		}
	}
	if h.fd < 0 {
		fsmetric.GoferOpens9P.Increment()
	} else {
		fsmetric.GoferOpensHost.Increment()
	}
	return sfd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
//
// It unregisters the host FD from fdnotifier (if it was registered), closes
// the handle while holding releaseMu exclusively so that the close cannot
// overlap with fd.sync(), and finally drops fd from the filesystem's set of
// special file FDs.
func (fd *specialFileFD) Release(ctx context.Context) {
	if fd.haveQueue {
		fdnotifier.RemoveFD(fd.handle.fd)
	}

	fd.releaseMu.Lock()
	fd.handle.close(ctx)
	fd.releaseMu.Unlock()

	mnt := fd.vfsfd.Mount()
	owner := mnt.Filesystem().Impl().(*filesystem)
	owner.syncMu.Lock()
	delete(owner.specialFileFDs, fd)
	owner.syncMu.Unlock()
}

// OnClose implements vfs.FileDescriptionImpl.OnClose.
//
// Writable FDs are flushed through the handle's lisafs FD or p9 file,
// depending on which protocol the filesystem uses. FDs that are not writable
// need no flush.
func (fd *specialFileFD) OnClose(ctx context.Context) error {
	switch {
	case !fd.vfsfd.IsWritable():
		return nil
	case fd.filesystem().opts.lisaEnabled:
		return fd.handle.fdLisa.Flush(ctx)
	default:
		return fd.handle.file.flush(ctx)
	}
}

// Readiness implements waiter.Waitable.Readiness.
//
// FDs with a notification queue poll the host FD directly; all others defer
// to the embedded fileDescription.
func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
	if !fd.haveQueue {
		return fd.fileDescription.Readiness(mask)
	}
	return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
//
// For FDs with a notification queue, e is added to fd.queue and fdnotifier is
// told to refresh its state for the host FD. Otherwise registration is
// forwarded to the embedded fileDescription.
func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	if !fd.haveQueue {
		fd.fileDescription.EventRegister(e, mask)
		return
	}
	fd.queue.EventRegister(e, mask)
	fdnotifier.UpdateFD(fd.handle.fd)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
//
// For FDs with a notification queue, e is removed from fd.queue and
// fdnotifier is told to refresh its state for the host FD. Otherwise the
// request is forwarded to the embedded fileDescription.
func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
	if !fd.haveQueue {
		fd.fileDescription.EventUnregister(e)
		return
	}
	fd.queue.EventUnregister(e)
	fdnotifier.UpdateFD(fd.handle.fd)
}

// Allocate implements vfs.FileDescriptionImpl.Allocate.
//
// Only regular files are allocated through the handle (via the lisafs FD or
// the p9 file, depending on the protocol in use); the allocation itself is
// driven by dentry.doAllocate. All other file types fall back to
// FileDescriptionDefaultImpl.Allocate.
func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
	if !fd.isRegularFile {
		return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
	}

	d := fd.dentry()
	allocateOnHandle := func() error {
		if !d.fs.opts.lisaEnabled {
			return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
		}
		return fd.handle.fdLisa.Allocate(ctx, mode, offset, length)
	}
	return d.doAllocate(ctx, offset, length, allocateOnHandle)
}

// PRead implements vfs.FileDescriptionImpl.PRead.
//
// Data previously stashed in fd.buf (pipes only) is handed out first; the
// handle is then read at offset into whatever remains of dst. For seekable
// files a negative offset fails with EINVAL. Flags other than RWF_HIPRI fail
// with EOPNOTSUPP. EAGAIN from the handle is reported as
// linuxerr.ErrWouldBlock. The returned count covers both buffered bytes and
// bytes read from the handle.
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	waitStart := fsmetric.StartReadWait()
	defer func() {
		// Attribute the read to the 9P or host metrics depending on whether
		// the handle has a host FD.
		if fd.handle.fd < 0 {
			fsmetric.GoferReads9P.Increment()
			fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, waitStart)
			return
		}
		fsmetric.GoferReadsHost.Increment()
		fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, waitStart)
	}()

	switch {
	case fd.seekable && offset < 0:
		return 0, linuxerr.EINVAL
	case opts.Flags&^linux.RWF_HIPRI != 0:
		// Only RWF_HIPRI is accepted.
		//
		// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
		return 0, linuxerr.EOPNOTSUPP
	}

	if dent := fd.dentry(); dent.cachedMetadataAuthoritative() {
		dent.touchAtime(fd.vfsfd.Mount())
	}

	// Drain buffered pipe data before touching the handle. The atomic load
	// lets the common no-buffer case skip bufMu entirely.
	var fromBuf int64
	if atomic.LoadUint32(&fd.haveBuf) != 0 {
		var copyErr error
		fd.bufMu.Lock()
		if len(fd.buf) > 0 {
			var copied int
			copied, copyErr = dst.CopyOut(ctx, fd.buf)
			dst = dst.DropFirst(copied)
			fd.buf = fd.buf[copied:]
			if len(fd.buf) == 0 {
				// Fully consumed: clear the flag and release the backing
				// array.
				atomic.StoreUint32(&fd.haveBuf, 0)
				fd.buf = nil
			}
			fromBuf = int64(copied)
			if offset >= 0 {
				offset += fromBuf
			}
		}
		fd.bufMu.Unlock()
		if copyErr != nil {
			return fromBuf, copyErr
		}
	}

	rw := getHandleReadWriter(ctx, &fd.handle, offset)
	fromHandle, err := dst.CopyOutFrom(ctx, rw)
	putHandleReadWriter(rw)
	if linuxerr.Equals(linuxerr.EAGAIN, err) {
		err = linuxerr.ErrWouldBlock
	}
	return fromBuf + fromHandle, err
}

// Read implements vfs.FileDescriptionImpl.Read.
//
// Seekable files read at fd.off under fd.mu and advance fd.off by the number
// of bytes read, even when an error is also returned. Non-seekable files read
// with an offset of -1.
func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	if fd.seekable {
		fd.mu.Lock()
		nRead, err := fd.PRead(ctx, dst, fd.off, opts)
		fd.off += nRead
		fd.mu.Unlock()
		return nRead, err
	}
	return fd.PRead(ctx, dst, -1, opts)
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
//
// It delegates to pwrite and discards the final offset, which is only
// meaningful to callers that track a file offset.
func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	written, _, err := fd.pwrite(ctx, src, offset, opts)
	return written, err
}

// pwrite returns the number of bytes written, final offset, error. The final
// offset should be ignored by PWrite.
//
// For seekable files a negative offset fails with EINVAL, and flags other
// than RWF_HIPRI fail with EOPNOTSUPP. On these early failures the returned
// final offset is the offset that was passed in.
//
// For regular files, d.metadataMu is held for the whole write; if the FD has
// O_APPEND set the write is redirected to the current d.size, src is
// truncated to the limit returned by vfs.CheckLimit, and d.size is grown if
// the write extended the file. EAGAIN from the handle is reported as
// linuxerr.ErrWouldBlock.
func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
	if fd.seekable && offset < 0 {
		return 0, offset, linuxerr.EINVAL
	}

	// Check that flags are supported.
	//
	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
	if opts.Flags&^linux.RWF_HIPRI != 0 {
		return 0, offset, linuxerr.EOPNOTSUPP
	}

	d := fd.dentry()
	if fd.isRegularFile {
		// If the regular file fd was opened with O_APPEND, make sure the file
		// size is updated. There is a possible race here if size is modified
		// externally after metadata cache is updated.
		if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
			if err := d.updateFromGetattr(ctx); err != nil {
				return 0, offset, err
			}
		}

		// We need to hold the metadataMu *while* writing to a regular file.
		// The deferred unlock runs after the size update at the end of this
		// function.
		d.metadataMu.Lock()
		defer d.metadataMu.Unlock()

		// Set offset to file size if the regular file was opened with O_APPEND.
		if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
			// Holding d.metadataMu is sufficient for reading d.size.
			offset = int64(d.size)
		}
		// Clamp the write to the number of bytes permitted at this offset
		// (presumably the file size resource limit; see vfs.CheckLimit).
		limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
		if err != nil {
			return 0, offset, err
		}
		src = src.TakeFirst64(limit)
	}

	// Update cached timestamps before writing. For regular files
	// d.metadataMu is already held, so the Locked variant must be used.
	if d.cachedMetadataAuthoritative() {
		if fd.isRegularFile {
			d.touchCMtimeLocked()
		} else {
			d.touchCMtime()
		}
	}

	rw := getHandleReadWriter(ctx, &fd.handle, offset)
	n, err := src.CopyInTo(ctx, rw)
	putHandleReadWriter(rw)
	if linuxerr.Equals(linuxerr.EAGAIN, err) {
		err = linuxerr.ErrWouldBlock
	}
	// Update offset if the offset is valid. Non-seekable files are written
	// with a negative offset, which is returned unchanged.
	if offset >= 0 {
		offset += n
	}
	// Update file size for regular files.
	if fd.isRegularFile {
		// d.metadataMu is already locked at this point.
		if uint64(offset) > d.size {
			// The size store is made under d.dataMu. Deferred calls run in
			// LIFO order, so d.dataMu is released before d.metadataMu.
			d.dataMu.Lock()
			defer d.dataMu.Unlock()
			atomic.StoreUint64(&d.size, uint64(offset))
		}
	}
	return int64(n), offset, err
}

// Write implements vfs.FileDescriptionImpl.Write.
//
// Seekable files write at fd.off under fd.mu and then adopt the final offset
// reported by pwrite. Non-seekable files write with an offset of -1.
func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	if fd.seekable {
		fd.mu.Lock()
		written, endOff, err := fd.pwrite(ctx, src, fd.off, opts)
		fd.off = endOff
		fd.mu.Unlock()
		return written, err
	}
	return fd.PWrite(ctx, src, -1, opts)
}

// Seek implements vfs.FileDescriptionImpl.Seek.
//
// Non-seekable files fail with ESPIPE. Otherwise the new offset is computed
// by regularFileSeekLocked under fd.mu and stored in fd.off; fd.off is left
// untouched on error.
func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	if !fd.seekable {
		return 0, linuxerr.ESPIPE
	}

	fd.mu.Lock()
	defer fd.mu.Unlock()

	pos, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
	if err == nil {
		fd.off = pos
		return pos, nil
	}
	return 0, err
}

// Sync implements vfs.FileDescriptionImpl.Sync.
//
// This is a sync of a single FD, so any error from the underlying sync is
// returned and no lisafs FDIDs are accumulated for batching.
func (fd *specialFileFD) Sync(ctx context.Context) error {
	const forFilesystemSync = false
	return fd.sync(ctx, forFilesystemSync, nil /* accFsyncFDIDsLisa */)
}

// sync syncs fd's handle, if it is still open.
//
// A host FD is fsynced directly when available. Otherwise, with lisafs
// enabled, the FD's ID is appended to *accFsyncFDIDsLisa when that is non-nil
// (no sync is issued here in that case), or the lisafs FD is synced
// immediately; without lisafs the p9 file is fsynced.
//
// If forFilesystemSync is true, errors are only returned for writable regular
// files; for any other FD the error is logged at debug level and dropped.
func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool, accFsyncFDIDsLisa *[]lisafs.FDID) error {
	// Hold releaseMu for reading so that fd.Release() cannot close the handle
	// while it is being synced.
	fd.releaseMu.RLock()
	defer fd.releaseMu.RUnlock()

	if !fd.handle.isOpen() {
		return nil
	}

	syncHandle := func() error {
		// If we have a host FD, fsyncing it is likely to be faster than an
		// fsync RPC.
		if fd.handle.fd >= 0 {
			ctx.UninterruptibleSleepStart(false)
			fsyncErr := unix.Fsync(int(fd.handle.fd))
			ctx.UninterruptibleSleepFinish(false)
			return fsyncErr
		}
		if !fd.filesystem().opts.lisaEnabled {
			return fd.handle.file.fsync(ctx)
		}
		if accFsyncFDIDsLisa == nil {
			return fd.handle.fdLisa.Sync(ctx)
		}
		*accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, fd.handle.fdLisa.ID())
		return nil
	}

	err := syncHandle()
	if err == nil {
		return nil
	}
	// Only return err if it was asked for directly, or if we can reasonably
	// have expected sync to succeed (fd represents a regular file that was
	// opened for writing).
	if !forFilesystemSync || (fd.isRegularFile && fd.vfsfd.IsWritable()) {
		return err
	}
	ctx.Debugf("gofer.specialFileFD.sync: syncing non-writable or non-regular-file FD failed: %v", err)
	return nil
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
//
// Memory mapping requires a host FD and is refused with ENODEV when there is
// none or when the filesystem is configured with forcePageCache.
func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	if canMapHostFD := fd.handle.fd >= 0 && !fd.filesystem().opts.forcePageCache; !canMapHostFD {
		return linuxerr.ENODEV
	}
	// From here on fd may be used as a memmap.Mappable and memmap.File, so
	// make sure the host file mapper has been initialized exactly once.
	fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init)
	return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts)
}

// AddMapping implements memmap.Mappable.AddMapping.
//
// It takes a reference in hostFileMapper on the file offsets
// [offset, offset+ar.Length()) and always succeeds.
func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
	mapped := memmap.MappableRange{
		Start: offset,
		End:   offset + uint64(ar.Length()),
	}
	fd.hostFileMapper.IncRefOn(mapped)
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
//
// It drops the hostFileMapper reference on the file offsets
// [offset, offset+ar.Length()).
func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
	unmapped := memmap.MappableRange{
		Start: offset,
		End:   offset + uint64(ar.Length()),
	}
	fd.hostFileMapper.DecRefOn(unmapped)
}

// CopyMapping implements memmap.Mappable.CopyMapping.
//
// The source address range is irrelevant here: the copy is handled exactly
// like a new mapping of dstAR at the same file offset.
func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	err := fd.AddMapping(ctx, ms, dstAR, offset, writable)
	return err
}

// Translate implements memmap.Mappable.Translate.
//
// The result is a single translation that maps the chosen range onto fd
// itself (acting as the memmap.File) at identical offsets, with any access
// permitted. The chosen range is the whole optional range, unless the
// filesystem sets limitHostFDTranslation, in which case it is
// maxFillRange(required, optional).
func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	src := optional
	if fd.filesystem().opts.limitHostFDTranslation {
		src = maxFillRange(required, optional)
	}
	translation := memmap.Translation{
		Source: src,
		File:   fd,
		Offset: src.Start,
		Perms:  hostarch.AnyAccess,
	}
	return []memmap.Translation{translation}, nil
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
//
// It is a no-op for specialFileFD: nothing is invalidated and nil is always
// returned.
func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error {
	return nil
}

// IncRef implements memmap.File.IncRef.
//
// The reference on fr is recorded in fd.fileRefs under fileRefsMu.
func (fd *specialFileFD) IncRef(fr memmap.FileRange) {
	mu := &fd.fileRefsMu
	mu.Lock()
	defer mu.Unlock()

	fd.fileRefs.IncRefAndAccount(fr)
}

// DecRef implements memmap.File.DecRef.
//
// The reference on fr is dropped from fd.fileRefs under fileRefsMu.
func (fd *specialFileFD) DecRef(fr memmap.FileRange) {
	mu := &fd.fileRefsMu
	mu.Lock()
	defer mu.Unlock()

	fd.fileRefs.DecRefAndAccount(fr)
}

// MapInternal implements memmap.File.MapInternal.
//
// It panics if fd has no host FD; otherwise the mapping of fr is obtained
// from hostFileMapper, writable if at.Write is set.
func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
	fd.requireHostFD()
	hostFD := int(fd.handle.fd)
	return fd.hostFileMapper.MapInternal(fr, hostFD, at.Write)
}

// FD implements memmap.File.FD.
//
// It returns the handle's host FD, panicking if there is none.
func (fd *specialFileFD) FD() int {
	fd.requireHostFD()
	hostFD := int(fd.handle.fd)
	return hostFD
}

// requireHostFD panics if fd's handle does not have a host FD. It guards the
// memmap.File methods, which cannot work without one.
func (fd *specialFileFD) requireHostFD() {
	if fd.handle.fd >= 0 {
		return
	}
	// This is possible if fd was successfully mmapped before saving, then
	// was restored without a host FD. This is unrecoverable: without a
	// host FD, we can't mmap this file post-restore.
	panic("gofer.specialFileFD can no longer be memory-mapped without a host FD")
}