// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"math"
	"strings"
	"sync/atomic"
	"syscall"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

// FDFlags define flags for an individual descriptor.
//
// +stateify savable
type FDFlags struct {
	// CloseOnExec indicates the descriptor should be closed on exec.
	CloseOnExec bool
}

// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
// representation.
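//
// A minimal illustration of both conversions (hypothetical values):
//
//	flags := FDFlags{CloseOnExec: true}
//	_ = flags.ToLinuxFileFlags() // Mask has linux.O_CLOEXEC set.
//	_ = flags.ToLinuxFDFlags()   // Mask has linux.FD_CLOEXEC set.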
func (f FDFlags) ToLinuxFileFlags() (mask uint) {
	if f.CloseOnExec {
		mask |= linux.O_CLOEXEC
	}
	return
}

// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
// representation.
func (f FDFlags) ToLinuxFDFlags() (mask uint) {
	if f.CloseOnExec {
		mask |= linux.FD_CLOEXEC
	}
	return
}

// descriptor holds the details about a file descriptor, namely a pointer to
// the file itself and the descriptor flags.
//
// Note that this is immutable; an entry is changed only by replacing it
// wholesale via operations on the descriptorTable.
//
// It contains both VFS1 and VFS2 file types, but only one of them can be set.
//
// +stateify savable
type descriptor struct {
	// TODO(gvisor.dev/issue/1624): Remove fs.File.
	file     *fs.File
	fileVFS2 *vfs.FileDescription
	flags    FDFlags
}

// FDTable is used to manage File references and flags.
//
// +stateify savable
type FDTable struct {
	refs.AtomicRefCount
	k *Kernel

	// mu protects below.
	mu sync.Mutex `state:"nosave"`

	// next is the starting position to search for an available fd.
	next int32

	// used contains the number of non-nil entries. It must be accessed
	// atomically; it may be read without holding mu, but writes require
	// mu to be held.
	used int32

	// descriptorTable holds descriptors.
	descriptorTable `state:".(map[int32]descriptor)"`
}

func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
	m := make(map[int32]descriptor)
	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		m[fd] = descriptor{
			file:     file,
			fileVFS2: fileVFS2,
			flags:    flags,
		}
	})
	return m
}

func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
	f.init() // Initialize table.
	for fd, d := range m {
		f.setAll(fd, d.file, d.fileVFS2, d.flags)

		// Note that we do _not_ need to acquire an extra table reference here.
		// The table reference will already be accounted for in the file, so we
		// drop the reference taken by setAll above.
		switch {
		case d.file != nil:
			d.file.DecRef()
		case d.fileVFS2 != nil:
			d.fileVFS2.DecRef()
		}
	}
}

// drop drops the table reference.
func (f *FDTable) drop(file *fs.File) {
	// Release locks.
	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})

	// Send inotify events.
	d := file.Dirent
	var ev uint32
	if fs.IsDir(d.Inode.StableAttr) {
		ev |= linux.IN_ISDIR
	}
	if file.Flags().Write {
		ev |= linux.IN_CLOSE_WRITE
	} else {
		ev |= linux.IN_CLOSE_NOWRITE
	}
	d.InotifyEvent(ev, 0)

	// Drop the table reference.
	file.DecRef()
}

// dropVFS2 drops the table reference.
func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
	// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
	// entire file.
	err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET)
	if err != nil && err != syserror.ENOLCK {
		panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
	}

	// Generate inotify events.
	ev := uint32(linux.IN_CLOSE_NOWRITE)
	if file.IsWritable() {
		ev = linux.IN_CLOSE_WRITE
	}
	file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent)

	// Drop the table's reference.
	file.DecRef()
}

// NewFDTable allocates a new FDTable that may be used by tasks in k.
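//
// A sketch of the expected lifecycle (illustrative; assumes a task-derived
// ctx and an already-open file):
//
//	table := k.NewFDTable()
//	fds, err := table.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{})
//	...
//	table.DecRef() // Final DecRef drops all remaining entries via destroy.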
func (k *Kernel) NewFDTable() *FDTable {
	f := &FDTable{k: k}
	f.init()
	return f
}

// destroy removes all of the file descriptors from the table.
func (f *FDTable) destroy() {
	f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
		return true
	})
}

// DecRef implements RefCounter.DecRef with destructor f.destroy.
func (f *FDTable) DecRef() {
	f.DecRefWithDestructor(f.destroy)
}

// Size returns the number of file descriptor slots currently allocated.
func (f *FDTable) Size() int {
	size := atomic.LoadInt32(&f.used)
	return int(size)
}

// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
	// retries tracks the number of failed TryIncRef attempts for the same FD.
	retries := 0
	fd := int32(0)
	for {
		file, fileVFS2, flags, ok := f.getAll(fd)
		if !ok {
			break
		}
		switch {
		case file != nil:
			if !file.TryIncRef() {
				retries++
				if retries > 1000 {
					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations))
				}
				continue // Race caught.
			}
			fn(fd, file, nil, flags)
			file.DecRef()
		case fileVFS2 != nil:
			if !fileVFS2.TryIncRef() {
				retries++
				if retries > 1000 {
					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl()))
				}
				continue // Race caught.
			}
			fn(fd, nil, fileVFS2, flags)
			fileVFS2.DecRef()
		}
		retries = 0
		fd++
	}
}

// String is a stringer for FDTable.
func (f *FDTable) String() string {
	var buf strings.Builder
	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		switch {
		case file != nil:
			n, _ := file.Dirent.FullName(nil /* root */)
			fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n)

		case fileVFS2 != nil:
			vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
			name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
			if err != nil {
				fmt.Fprintf(&buf, "<err: %v>\n", err)
				return
			}
			fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name)
		}
	})
	return buf.String()
}

// NewFDs allocates new FDs guaranteed to be the lowest numbers available
// greater than or equal to the fd parameter. All files will share the given
// flags. Success is guaranteed to be all or none.
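//
// For example (illustrative; f1 and f2 are hypothetical open files):
//
//	fds, err := table.NewFDs(ctx, 0, []*fs.File{f1, f2}, FDFlags{CloseOnExec: true})
//	if err != nil {
//		return err // EMFILE: not enough descriptors were available.
//	}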
func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) {
	if fd < 0 {
		// Don't accept negative FDs.
		return nil, syscall.EINVAL
	}

	// Default limit.
	end := int32(math.MaxInt32)

	// Ensure we don't get past the provided limit.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		lim := limitSet.Get(limits.NumberOfFiles)
		if lim.Cur != limits.Infinity {
			end = int32(lim.Cur)
		}
		if fd >= end {
			return nil, syscall.EMFILE
		}
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Start searching for an available fd at f.next.
	if fd < f.next {
		fd = f.next
	}

	// Install all entries.
	for i := fd; i < end && len(fds) < len(files); i++ {
		if d, _, _ := f.get(i); d == nil {
			f.set(i, files[len(fds)], flags) // Set the descriptor.
			fds = append(fds, i)             // Record the file descriptor.
		}
	}

	// Failure? Unwind existing FDs.
	if len(fds) < len(files) {
		for _, i := range fds {
			f.set(i, nil, FDFlags{}) // Zap entry.
		}
		return nil, syscall.EMFILE
	}

	if fd == f.next {
		// Update next search start position.
		f.next = fds[len(fds)-1] + 1
	}

	return fds, nil
}

// NewFDsVFS2 allocates new FDs guaranteed to be the lowest numbers available
// greater than or equal to the fd parameter. All files will share the given
// flags. Success is guaranteed to be all or none.
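//
// For example, a pipe(2)-style setup installs both ends at once
// (illustrative; r and w are hypothetical FileDescriptions):
//
//	fds, err := table.NewFDsVFS2(ctx, 0, []*vfs.FileDescription{r, w}, FDFlags{})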
func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
	if fd < 0 {
		// Don't accept negative FDs.
		return nil, syscall.EINVAL
	}

	// Default limit.
	end := int32(math.MaxInt32)

	// Ensure we don't get past the provided limit.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		lim := limitSet.Get(limits.NumberOfFiles)
		if lim.Cur != limits.Infinity {
			end = int32(lim.Cur)
		}
		if fd >= end {
			return nil, syscall.EMFILE
		}
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Start searching for an available fd at f.next.
	if fd < f.next {
		fd = f.next
	}

	// Install all entries.
	for i := fd; i < end && len(fds) < len(files); i++ {
		if d, _, _ := f.getVFS2(i); d == nil {
			f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
			fds = append(fds, i)                 // Record the file descriptor.
		}
	}

	// Failure? Unwind existing FDs.
	if len(fds) < len(files) {
		for _, i := range fds {
			f.setVFS2(i, nil, FDFlags{}) // Zap entry.
		}
		return nil, syscall.EMFILE
	}

	if fd == f.next {
		// Update next search start position.
		f.next = fds[len(fds)-1] + 1
	}

	return fds, nil
}

// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
// the given file description. If it succeeds, it takes a reference on file.
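//
// For example, an F_DUPFD-style duplication might look like (illustrative):
//
//	newFD, err := table.NewFDVFS2(ctx, minfd, file, FDFlags{})
//	if err != nil {
//		return 0, err
//	}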
func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
	if minfd < 0 {
		// Don't accept negative FDs.
		return -1, syscall.EINVAL
	}

	// Default limit.
	end := int32(math.MaxInt32)

	// Ensure we don't get past the provided limit.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		lim := limitSet.Get(limits.NumberOfFiles)
		if lim.Cur != limits.Infinity {
			end = int32(lim.Cur)
		}
		if minfd >= end {
			return -1, syscall.EMFILE
		}
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Start searching for an available fd at f.next.
	fd := minfd
	if fd < f.next {
		fd = f.next
	}
	for fd < end {
		if d, _, _ := f.getVFS2(fd); d == nil {
			f.setVFS2(fd, file, flags)
			if fd == f.next {
				// Update next search start position.
				f.next = fd + 1
			}
			return fd, nil
		}
		fd++
	}
	return -1, syscall.EMFILE
}

// NewFDAt sets the file reference for the given FD. If there is an existing
// file for that FD, the table's reference to it is dropped.
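//
// This gives dup2(2)-style replacement semantics; for example (illustrative):
//
//	if err := table.NewFDAt(ctx, newfd, file, FDFlags{}); err != nil {
//		return err // e.g. EMFILE if newfd exceeds RLIMIT_NOFILE.
//	}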
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
	return f.newFDAt(ctx, fd, file, nil, flags)
}

// NewFDAtVFS2 sets the file reference for the given FD. If there is an
// existing file for that FD, the table's reference to it is dropped.
func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
	return f.newFDAt(ctx, fd, nil, file, flags)
}

func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
	if fd < 0 {
		// Don't accept negative FDs.
		return syscall.EBADF
	}

	// Check the limit for the provided file.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
			return syscall.EMFILE
		}
	}

	// Install the entry.
	f.mu.Lock()
	defer f.mu.Unlock()
	f.setAll(fd, file, fileVFS2, flags)
	return nil
}

// SetFlags sets the flags for the given file descriptor.
//
// EBADF is returned if fd is negative or does not refer to an open file.
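//
// This backs fcntl(F_SETFD)-style updates; for example (illustrative, with
// arg being a hypothetical userspace argument):
//
//	err := table.SetFlags(fd, FDFlags{
//		CloseOnExec: arg&linux.FD_CLOEXEC != 0,
//	})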
func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
	if fd < 0 {
		// Don't accept negative FDs.
		return syscall.EBADF
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	file, _, _ := f.get(fd)
	if file == nil {
		// No file found.
		return syscall.EBADF
	}

	// Update the flags.
	f.set(fd, file, flags)
	return nil
}

// SetFlagsVFS2 sets the flags for the given file descriptor.
//
// EBADF is returned if fd is negative or does not refer to an open file.
func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
	if fd < 0 {
		// Don't accept negative FDs.
		return syscall.EBADF
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	file, _, _ := f.getVFS2(fd)
	if file == nil {
		// No file found.
		return syscall.EBADF
	}

	// Update the flags.
	f.setVFS2(fd, file, flags)
	return nil
}

// Get returns a reference to the file and the flags for the FD, or a nil
// file if none is defined for the given fd.
//
// N.B. Callers are required to use DecRef when they are done.
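//
// A typical lookup follows a get/use/DecRef pattern (illustrative):
//
//	file, _ := table.Get(fd)
//	if file == nil {
//		return syscall.EBADF
//	}
//	defer file.DecRef()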
//
//go:nosplit
func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
	if fd < 0 {
		return nil, FDFlags{}
	}

	for {
		file, flags, _ := f.get(fd)
		if file != nil {
			if !file.TryIncRef() {
				continue // Race caught.
			}
			// Reference acquired.
			return file, flags
		}
		// No file available.
		return nil, FDFlags{}
	}
}

// GetVFS2 returns a reference to the file and the flags for the FD, or a nil
// file if none is defined for the given fd.
//
// N.B. Callers are required to use DecRef when they are done.
//
//go:nosplit
func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
	if fd < 0 {
		return nil, FDFlags{}
	}

	for {
		file, flags, _ := f.getVFS2(fd)
		if file != nil {
			if !file.TryIncRef() {
				continue // Race caught.
			}
			// Reference acquired.
			return file, flags
		}
		// No file available.
		return nil, FDFlags{}
	}
}

// GetFDs returns a sorted list of valid fds.
//
// Precondition: The caller must be running on the task goroutine, or Task.mu
// must be locked.
func (f *FDTable) GetFDs() []int32 {
	fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
	f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
		fds = append(fds, fd)
	})
	return fds
}

// GetRefs returns a stable slice of references to all files and bumps the
// reference count on each. The caller must use DecRef on each reference when
// they're done using the slice.
func (f *FDTable) GetRefs() []*fs.File {
	files := make([]*fs.File, 0, f.Size())
	f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
		file.IncRef() // Acquire a reference for caller.
		files = append(files, file)
	})
	return files
}

// GetRefsVFS2 returns a stable slice of references to all files and bumps the
// reference count on each. The caller must use DecRef on each reference when
// they're done using the slice.
func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
	files := make([]*vfs.FileDescription, 0, f.Size())
	f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
		file.IncRef() // Acquire a reference for caller.
		files = append(files, file)
	})
	return files
}

// Fork returns an independent FDTable, with each inherited entry holding a
// new table reference to its file.
func (f *FDTable) Fork() *FDTable {
	clone := f.k.NewFDTable()

	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		// The set function here will acquire an appropriate table
		// reference for the clone. We don't need anything else.
		switch {
		case file != nil:
			clone.set(fd, file, flags)
		case fileVFS2 != nil:
			clone.setVFS2(fd, fileVFS2, flags)
		}
	})
	return clone
}

// Remove removes an FD from the table and returns a non-nil file (VFS1 or
// VFS2, never both) iff successful.
//
// N.B. Callers are required to use DecRef when they are done.
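//
// For example, close(2)-style removal can be sketched as (illustrative):
//
//	file, fileVFS2 := table.Remove(fd)
//	if file == nil && fileVFS2 == nil {
//		return syscall.EBADF // Nothing was installed at fd.
//	}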
func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
	if fd < 0 {
		return nil, nil
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Update current available position.
	if fd < f.next {
		f.next = fd
	}

	orig, orig2, _, _ := f.getAll(fd)

	// Add reference for caller.
	switch {
	case orig != nil:
		orig.IncRef()
	case orig2 != nil:
		orig2.IncRef()
	}
	if orig != nil || orig2 != nil {
		f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
	}
	return orig, orig2
}

// RemoveIf removes all FDs where cond is true.
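//
// For example, close-on-exec handling at exec time can be expressed as
// (illustrative):
//
//	table.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
//		return flags.CloseOnExec
//	})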
func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
	f.mu.Lock()
	defer f.mu.Unlock()

	f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		if cond(file, fileVFS2, flags) {
			f.set(fd, nil, FDFlags{}) // Clear from table.
			// Update current available position.
			if fd < f.next {
				f.next = fd
			}
		}
	})
}