// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"syscall"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// fileOpAt performs an operation on the second last component in the path.
func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
	// Extract the last component.
	dir, name := fs.SplitLast(path)
	if dir == "/" {
		// Common case: we are accessing a file in the root.
		root := t.FSContext().RootDirectory()
		err := fn(root, root, name, linux.MaxSymlinkTraversals)
		root.DecRef(t)
		return err
	} else if dir == "." && dirFD == linux.AT_FDCWD {
		// Common case: we are accessing a file relative to the current
		// working directory; skip the look-up.
		wd := t.FSContext().WorkingDirectory()
		root := t.FSContext().RootDirectory()
		err := fn(root, wd, name, linux.MaxSymlinkTraversals)
		wd.DecRef(t)
		root.DecRef(t)
		return err
	}

	return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error {
		return fn(root, d, name, remainingTraversals)
	})
}

// fileOpOn performs an operation on the last entry of the path.
func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
	var (
		d   *fs.Dirent // The file.
		wd  *fs.Dirent // The working directory (if required.)
		rel *fs.Dirent // The relative directory for search (if required.)
		f   *fs.File   // The file corresponding to dirFD (if required.)
		err error
	)

	// Extract the working directory (maybe).
	if len(path) > 0 && path[0] == '/' {
		// Absolute path; rel can be nil.
	} else if dirFD == linux.AT_FDCWD {
		// Need to reference the working directory.
		wd = t.FSContext().WorkingDirectory()
		rel = wd
	} else {
		// Need to extract the given FD.
		f = t.GetFile(dirFD)
		if f == nil {
			return syserror.EBADF
		}
		rel = f.Dirent
		if !fs.IsDir(rel.Inode.StableAttr) {
			f.DecRef(t)
			return syserror.ENOTDIR
		}
	}

	// Grab the root (always required.)
	root := t.FSContext().RootDirectory()

	// Lookup the node.
	remainingTraversals := uint(linux.MaxSymlinkTraversals)
	if resolve {
		d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals)
	} else {
		d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals)
	}
	root.DecRef(t)
	if wd != nil {
		wd.DecRef(t)
	}
	if f != nil {
		f.DecRef(t)
	}
	if err != nil {
		return err
	}

	err = fn(root, d, remainingTraversals)
	d.DecRef(t)
	return err
}

// copyInPath copies a path in.
func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) {
	path, err = t.CopyInString(addr, linux.PATH_MAX)
	if err != nil {
		return "", false, err
	}
	if path == "" && !allowEmpty {
		return "", false, syserror.ENOENT
	}

	// If the path ends with a /, then checks must be enforced in various
	// ways in the different callers. We pass this back to the caller.
	path, dirPath = fs.TrimTrailingSlashes(path)

	return path, dirPath, nil
}

// LINT.IfChange

func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, err
	}

	resolve := flags&linux.O_NOFOLLOW == 0
	err = fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		// First check a few things about the filesystem before trying to get the file
		// reference.
		//
		// It's required that Check does not try to open files not that aren't backed by
		// this dirent (e.g. pipes and sockets) because this would result in opening these
		// files an extra time just to check permissions.
		if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
			return err
		}

		if fs.IsSymlink(d.Inode.StableAttr) && !resolve {
			return syserror.ELOOP
		}

		fileFlags := linuxToFlags(flags)
		// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
		fileFlags.LargeFile = true
		if fs.IsDir(d.Inode.StableAttr) {
			// Don't allow directories to be opened writable.
			if fileFlags.Write {
				return syserror.EISDIR
			}
		} else {
			// If O_DIRECTORY is set, but the file is not a directory, then fail.
			if fileFlags.Directory {
				return syserror.ENOTDIR
			}
			// If it's a directory, then make sure.
			if dirPath {
				return syserror.ENOTDIR
			}
		}

		file, err := d.Inode.GetFile(t, d, fileFlags)
		if err != nil {
			return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
		}
		defer file.DecRef(t)

		// Truncate is called when O_TRUNC is specified for any kind of
		// existing Dirent. Behavior is delegated to the entry's Truncate
		// implementation.
		if flags&linux.O_TRUNC != 0 {
			if err := d.Inode.Truncate(t, d, 0); err != nil {
				return err
			}
		}

		// Success.
		newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
			CloseOnExec: flags&linux.O_CLOEXEC != 0,
		})
		if err != nil {
			return err
		}

		// Set return result in frame.
		fd = uintptr(newFD)

		// Generate notification for opened file.
		d.InotifyEvent(linux.IN_OPEN, 0)

		return nil
	})
	return fd, err // Use result in frame.
}

func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}
	if dirPath {
		return syserror.ENOENT
	}

	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Do we have the appropriate permissions on the parent?
		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
			return err
		}

		// Attempt a creation.
		perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))

		switch mode.FileType() {
		case 0:
			// "Zero file type is equivalent to type S_IFREG." - mknod(2)
			fallthrough
		case linux.ModeRegular:
			// We are not going to return the file, so the actual
			// flags used don't matter, but they cannot be empty or
			// Create will complain.
			flags := fs.FileFlags{Read: true, Write: true}
			file, err := d.Create(t, root, name, flags, perms)
			if err != nil {
				return err
			}
			file.DecRef(t)
			return nil

		case linux.ModeNamedPipe:
			return d.CreateFifo(t, root, name, perms)

		case linux.ModeSocket:
			// While it is possible create a unix domain socket file on linux
			// using mknod(2), in practice this is pretty useless from an
			// application. Linux internally uses mknod() to create the socket
			// node during bind(2), but we implement bind(2) independently. If
			// an application explicitly creates a socket node using mknod(),
			// you can't seem to bind() or connect() to the resulting socket.
			//
			// Instead of emulating this seemingly useless behaviour, we'll
			// indicate that the filesystem doesn't support the creation of
			// sockets.
			return syserror.EOPNOTSUPP

		case linux.ModeCharacterDevice:
			fallthrough
		case linux.ModeBlockDevice:
			// TODO(b/72101894): We don't support creating block or character
			// devices at the moment.
			//
			// When we start supporting block and character devices, we'll
			// need to check for CAP_MKNOD here.
			return syserror.EPERM

		default:
			// "EINVAL - mode requested creation of something other than a
			// regular file, device special file, FIFO or socket." - mknod(2)
			return syserror.EINVAL
		}
	})
}

// Mknod implements the linux syscall mknod(2).
func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	path := args[0].Pointer()
	mode := linux.FileMode(args[1].ModeT())
	// We don't need this argument until we support creation of device nodes.
	_ = args[2].Uint() // dev

	return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode)
}

// Mknodat implements the linux syscall mknodat(2).
func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	path := args[1].Pointer()
	mode := linux.FileMode(args[2].ModeT())
	// We don't need this argument until we support creation of device nodes.
	_ = args[3].Uint() // dev

	return 0, nil, mknodAt(t, dirFD, path, mode)
}

func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, err
	}
	if dirPath {
		return 0, syserror.ENOENT
	}

	fileFlags := linuxToFlags(flags)
	// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
	fileFlags.LargeFile = true

	err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error {
		// Resolve the name to see if it exists, and follow any
		// symlinks along the way. We must do the symlink resolution
		// manually because if the symlink target does not exist, we
		// must create the target (and not the symlink itself).
		var (
			found *fs.Dirent
			err   error
		)
		for {
			if !fs.IsDir(parent.Inode.StableAttr) {
				return syserror.ENOTDIR
			}

			// Start by looking up the dirent at 'name'.
			found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals)
			if err != nil {
				break
			}
			defer found.DecRef(t)

			// We found something (possibly a symlink). If the
			// O_EXCL flag was passed, then we can immediately
			// return EEXIST.
			if flags&linux.O_EXCL != 0 {
				return syserror.EEXIST
			}

			// If we have a non-symlink, then we can proceed.
			if !fs.IsSymlink(found.Inode.StableAttr) {
				break
			}

			// If O_NOFOLLOW was passed, then don't try to resolve
			// anything.
			if flags&linux.O_NOFOLLOW != 0 {
				return syserror.ELOOP
			}

			// Try to resolve the symlink directly to a Dirent.
			var resolved *fs.Dirent
			resolved, err = found.Inode.Getlink(t)
			if err == nil {
				// No more resolution necessary.
				defer resolved.DecRef(t)
				break
			}
			if err != fs.ErrResolveViaReadlink {
				return err
			}

			// Are we able to resolve further?
			if remainingTraversals == 0 {
				return syscall.ELOOP
			}

			// Resolve the symlink to a path via Readlink.
			var path string
			path, err = found.Inode.Readlink(t)
			if err != nil {
				break
			}
			remainingTraversals--

			// Get the new parent from the target path.
			var newParent *fs.Dirent
			newParentPath, newName := fs.SplitLast(path)
			newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals)
			if err != nil {
				break
			}
			defer newParent.DecRef(t)

			// Repeat the process with the parent and name of the
			// symlink target.
			parent = newParent
			name = newName
		}

		var newFile *fs.File
		switch err {
		case nil:
			// Like sys_open, check for a few things about the
			// filesystem before trying to get a reference to the
			// fs.File. The same constraints on Check apply.
			if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
				return err
			}

			// Truncate is called when O_TRUNC is specified for any kind of
			// existing Dirent. Behavior is delegated to the entry's Truncate
			// implementation.
			if flags&linux.O_TRUNC != 0 {
				if err := found.Inode.Truncate(t, found, 0); err != nil {
					return err
				}
			}

			// Create a new fs.File.
			newFile, err = found.Inode.GetFile(t, found, fileFlags)
			if err != nil {
				return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
			}
			defer newFile.DecRef(t)
		case syserror.ENOENT:
			// File does not exist. Proceed with creation.

			// Do we have write permissions on the parent?
			if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
				return err
			}

			// Attempt a creation.
			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
			newFile, err = parent.Create(t, root, name, fileFlags, perms)
			if err != nil {
				// No luck, bail.
				return err
			}
			defer newFile.DecRef(t)
			found = newFile.Dirent
		default:
			return err
		}

		// Success.
		newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{
			CloseOnExec: flags&linux.O_CLOEXEC != 0,
		})
		if err != nil {
			return err
		}

		// Set result in frame.
		fd = uintptr(newFD)

		// Queue the open inotify event. The creation event is
		// automatically queued when the dirent is found. The open
		// events are implemented at the syscall layer so we need to
		// manually queue one here.
		found.InotifyEvent(linux.IN_OPEN, 0)

		return nil
	})
	return fd, err // Use result in frame.
}

// Open implements linux syscall open(2).
func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	flags := uint(args[1].Uint())
	if flags&linux.O_CREAT != 0 {
		mode := linux.FileMode(args[2].ModeT())
		n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode)
		return n, nil, err
	}
	n, err := openAt(t, linux.AT_FDCWD, addr, flags)
	return n, nil, err
}

// Openat implements linux syscall openat(2).
func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	addr := args[1].Pointer()
	flags := uint(args[2].Uint())
	if flags&linux.O_CREAT != 0 {
		mode := linux.FileMode(args[3].ModeT())
		n, err := createAt(t, dirFD, addr, flags, mode)
		return n, nil, err
	}
	n, err := openAt(t, dirFD, addr, flags)
	return n, nil, err
}

// Creat implements linux syscall creat(2).
func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	mode := linux.FileMode(args[1].ModeT())
	n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode)
	return n, nil, err
}

// accessContext is a context that overrides the credentials used, but
// otherwise carries the same values as the embedded context.
//
// accessContext should only be used for access(2).
type accessContext struct {
	context.Context
	creds *auth.Credentials
}

// Value implements context.Context.
func (ac accessContext) Value(key interface{}) interface{} {
	switch key {
	case auth.CtxCredentials:
		return ac.creds
	default:
		return ac.Context.Value(key)
	}
}

func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode uint) error {
	const rOK = 4
	const wOK = 2
	const xOK = 1

	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	// Sanity check the mode.
	if mode&^(rOK|wOK|xOK) != 0 {
		return syserror.EINVAL
	}

	return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		// access(2) and faccessat(2) check permissions using real
		// UID/GID, not effective UID/GID.
		//
		// "access() needs to use the real uid/gid, not the effective
		// uid/gid. We do this by temporarily clearing all FS-related
		// capabilities and switching the fsuid/fsgid around to the
		// real ones." -fs/open.c:faccessat
		creds := t.Credentials().Fork()
		creds.EffectiveKUID = creds.RealKUID
		creds.EffectiveKGID = creds.RealKGID
		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
			creds.EffectiveCaps = creds.PermittedCaps
		} else {
			creds.EffectiveCaps = 0
		}

		ctx := &accessContext{
			Context: t,
			creds:   creds,
		}

		return d.Inode.CheckPermission(ctx, fs.PermMask{
			Read:    mode&rOK != 0,
			Write:   mode&wOK != 0,
			Execute: mode&xOK != 0,
		})
	})
}

// Access implements linux syscall access(2).
func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	mode := args[1].ModeT()

	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
}

// Faccessat implements linux syscall faccessat(2).
//
// Note that the faccessat() system call does not take a flags argument:
// "The raw faccessat() system call takes only the first three arguments. The
// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
// the glibc wrapper function for faccessat().  If either of these flags is
// specified, then the wrapper function employs fstatat(2) to determine access
// permissions." - faccessat(2)
func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	addr := args[1].Pointer()
	mode := args[2].ModeT()

	return 0, nil, accessAt(t, dirFD, addr, mode)
}

// LINT.ThenChange(vfs2/filesystem.go)

// LINT.IfChange

// Ioctl implements linux syscall ioctl(2).
func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	request := int(args[1].Int())

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	// Shared flags between file and socket.
	switch request {
	case linux.FIONCLEX:
		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
			CloseOnExec: false,
		})
		return 0, nil, nil
	case linux.FIOCLEX:
		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
			CloseOnExec: true,
		})
		return 0, nil, nil

	case linux.FIONBIO:
		var set int32
		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
			return 0, nil, err
		}
		flags := file.Flags()
		if set != 0 {
			flags.NonBlocking = true
		} else {
			flags.NonBlocking = false
		}
		file.SetFlags(flags.Settable())
		return 0, nil, nil

	case linux.FIOASYNC:
		var set int32
		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
			return 0, nil, err
		}
		flags := file.Flags()
		if set != 0 {
			flags.Async = true
		} else {
			flags.Async = false
		}
		file.SetFlags(flags.Settable())
		return 0, nil, nil

	case linux.FIOSETOWN, linux.SIOCSPGRP:
		var set int32
		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
			return 0, nil, err
		}
		fSetOwn(t, int(fd), file, set)
		return 0, nil, nil

	case linux.FIOGETOWN, linux.SIOCGPGRP:
		_, err := primitive.CopyInt32Out(t, args[2].Pointer(), fGetOwn(t, file))
		return 0, nil, err

	default:
		ret, err := file.FileOperations.Ioctl(t, file, t.MemoryManager(), args)
		if err != nil {
			return 0, nil, err
		}

		return ret, nil, nil
	}
}

// LINT.ThenChange(vfs2/ioctl.go)

// LINT.IfChange

// Getcwd implements the linux syscall getcwd(2).
func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	size := args[1].SizeT()
	cwd := t.FSContext().WorkingDirectory()
	defer cwd.DecRef(t)
	root := t.FSContext().RootDirectory()
	defer root.DecRef(t)

	// Get our fullname from the root and preprend unreachable if the root was
	// unreachable from our current dirent this is the same behavior as on linux.
	s, reachable := cwd.FullName(root)
	if !reachable {
		s = "(unreachable)" + s
	}

	// Note this is >= because we need a terminator.
	if uint(len(s)) >= size {
		return 0, nil, syserror.ERANGE
	}

	// Copy out the path name for the node.
	bytes, err := t.CopyOutBytes(addr, []byte(s))
	if err != nil {
		return 0, nil, err
	}

	// Top it off with a terminator.
	_, err = t.CopyOutBytes(addr+usermem.Addr(bytes), []byte("\x00"))
	return uintptr(bytes + 1), nil, err
}

// Chroot implements the linux syscall chroot(2).
func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
		return 0, nil, syserror.EPERM
	}

	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, nil, err
	}

	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		// Is it a directory?
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Does it have execute permissions?
		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
			return err
		}

		t.FSContext().SetRootDirectory(t, d)
		return nil
	})
}

// Chdir implements the linux syscall chdir(2).
func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, nil, err
	}

	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		// Is it a directory?
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Does it have execute permissions?
		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
			return err
		}

		t.FSContext().SetWorkingDirectory(t, d)
		return nil
	})
}

// Fchdir implements the linux syscall fchdir(2).
func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	// Is it a directory?
	if !fs.IsDir(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.ENOTDIR
	}

	// Does it have execute permissions?
	if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
		return 0, nil, err
	}

	t.FSContext().SetWorkingDirectory(t, file.Dirent)
	return 0, nil, nil
}

// LINT.ThenChange(vfs2/fscontext.go)

// LINT.IfChange

// Close implements linux syscall close(2).
func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()

	// Note that Remove provides a reference on the file that we may use to
	// flush. It is still active until we drop the final reference below
	// (and other reference-holding operations complete).
	file, _ := t.FDTable().Remove(t, fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	err := file.Flush(t)
	return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file)
}

// Dup implements linux syscall dup(2).
func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
	if err != nil {
		return 0, nil, syserror.EMFILE
	}
	return uintptr(newFD), nil, nil
}

// Dup2 implements linux syscall dup2(2).
func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldfd := args[0].Int()
	newfd := args[1].Int()

	// If oldfd is a valid file descriptor, and newfd has the same value as oldfd,
	// then dup2() does nothing, and returns newfd.
	if oldfd == newfd {
		oldFile := t.GetFile(oldfd)
		if oldFile == nil {
			return 0, nil, syserror.EBADF
		}
		defer oldFile.DecRef(t)

		return uintptr(newfd), nil, nil
	}

	// Zero out flags arg to be used by Dup3.
	args[2].Value = 0
	return Dup3(t, args)
}

// Dup3 implements linux syscall dup3(2).
func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldfd := args[0].Int()
	newfd := args[1].Int()
	flags := args[2].Uint()

	if oldfd == newfd {
		return 0, nil, syserror.EINVAL
	}

	oldFile := t.GetFile(oldfd)
	if oldFile == nil {
		return 0, nil, syserror.EBADF
	}
	defer oldFile.DecRef(t)

	err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
	if err != nil {
		return 0, nil, err
	}

	return uintptr(newfd), nil, nil
}

func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
	ma := file.Async(nil)
	if ma == nil {
		return linux.FOwnerEx{}
	}
	a := ma.(*fasync.FileAsync)
	ot, otg, opg := a.Owner()
	switch {
	case ot != nil:
		return linux.FOwnerEx{
			Type: linux.F_OWNER_TID,
			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
		}
	case otg != nil:
		return linux.FOwnerEx{
			Type: linux.F_OWNER_PID,
			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
		}
	case opg != nil:
		return linux.FOwnerEx{
			Type: linux.F_OWNER_PGRP,
			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
		}
	default:
		return linux.FOwnerEx{}
	}
}

func fGetOwn(t *kernel.Task, file *fs.File) int32 {
	owner := fGetOwnEx(t, file)
	if owner.Type == linux.F_OWNER_PGRP {
		return -owner.PID
	}
	return owner.PID
}

// fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
//
// If who is positive, it represents a PID. If negative, it represents a PGID.
// If the PID or PGID is invalid, the owner is silently unset.
func fSetOwn(t *kernel.Task, fd int, file *fs.File, who int32) error {
	a := file.Async(fasync.New(fd)).(*fasync.FileAsync)
	if who < 0 {
		// Check for overflow before flipping the sign.
		if who-1 > who {
			return syserror.EINVAL
		}
		pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who))
		a.SetOwnerProcessGroup(t, pg)
	} else {
		tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
		a.SetOwnerThreadGroup(t, tg)
	}
	return nil
}

// Fcntl implements linux syscall fcntl(2).
func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	cmd := args[1].Int()

	file, flags := t.FDTable().Get(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	switch cmd {
	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
		from := args[2].Int()
		fd, err := t.NewFDFrom(from, file, kernel.FDFlags{
			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
		})
		if err != nil {
			return 0, nil, err
		}
		return uintptr(fd), nil, nil
	case linux.F_GETFD:
		return uintptr(flags.ToLinuxFDFlags()), nil, nil
	case linux.F_SETFD:
		flags := args[2].Uint()
		err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
		})
		return 0, nil, err
	case linux.F_GETFL:
		return uintptr(file.Flags().ToLinux()), nil, nil
	case linux.F_SETFL:
		flags := uint(args[2].Uint())
		file.SetFlags(linuxToFlags(flags).Settable())
		return 0, nil, nil
	case linux.F_SETLK, linux.F_SETLKW:
		// In Linux the file system can choose to provide lock operations for an inode.
		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
		// hammer by only allowing locks on files and directories.
		if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) {
			return 0, nil, syserror.EBADF
		}

		// Copy in the lock request.
		flockAddr := args[2].Pointer()
		var flock linux.Flock
		if _, err := flock.CopyIn(t, flockAddr); err != nil {
			return 0, nil, err
		}

		// Compute the lock whence.
		var sw fs.SeekWhence
		switch flock.Whence {
		case 0:
			sw = fs.SeekSet
		case 1:
			sw = fs.SeekCurrent
		case 2:
			sw = fs.SeekEnd
		default:
			return 0, nil, syserror.EINVAL
		}

		// Compute the lock offset.
		var off int64
		switch sw {
		case fs.SeekSet:
			off = 0
		case fs.SeekCurrent:
			// Note that Linux does not hold any mutexes while retrieving the file offset,
			// see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
			off = file.Offset()
		case fs.SeekEnd:
			uattr, err := file.Dirent.Inode.UnstableAttr(t)
			if err != nil {
				return 0, nil, err
			}
			off = uattr.Size
		default:
			return 0, nil, syserror.EINVAL
		}

		// Compute the lock range.
		rng, err := lock.ComputeRange(flock.Start, flock.Len, off)
		if err != nil {
			return 0, nil, err
		}

		// These locks don't block; execute the non-blocking operation using the inode's lock
		// context directly.
		switch flock.Type {
		case linux.F_RDLCK:
			if !file.Flags().Read {
				return 0, nil, syserror.EBADF
			}
			if cmd == linux.F_SETLK {
				// Non-blocking lock, provide a nil lock.Blocker.
				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) {
					return 0, nil, syserror.EAGAIN
				}
			} else {
				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) {
					return 0, nil, syserror.EINTR
				}
			}
			return 0, nil, nil
		case linux.F_WRLCK:
			if !file.Flags().Write {
				return 0, nil, syserror.EBADF
			}
			if cmd == linux.F_SETLK {
				// Non-blocking lock, provide a nil lock.Blocker.
				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) {
					return 0, nil, syserror.EAGAIN
				}
			} else {
				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) {
					return 0, nil, syserror.EINTR
				}
			}
			return 0, nil, nil
		case linux.F_UNLCK:
			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
			return 0, nil, nil
		default:
			return 0, nil, syserror.EINVAL
		}
	case linux.F_GETOWN:
		return uintptr(fGetOwn(t, file)), nil, nil
	case linux.F_SETOWN:
		return 0, nil, fSetOwn(t, int(fd), file, args[2].Int())
	case linux.F_GETOWN_EX:
		addr := args[2].Pointer()
		owner := fGetOwnEx(t, file)
		_, err := owner.CopyOut(t, addr)
		return 0, nil, err
	case linux.F_SETOWN_EX:
		addr := args[2].Pointer()
		var owner linux.FOwnerEx
		_, err := owner.CopyIn(t, addr)
		if err != nil {
			return 0, nil, err
		}
		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
		switch owner.Type {
		case linux.F_OWNER_TID:
			task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
			if task == nil {
				return 0, nil, syserror.ESRCH
			}
			a.SetOwnerTask(t, task)
			return 0, nil, nil
		case linux.F_OWNER_PID:
			tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
			if tg == nil {
				return 0, nil, syserror.ESRCH
			}
			a.SetOwnerThreadGroup(t, tg)
			return 0, nil, nil
		case linux.F_OWNER_PGRP:
			pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
			if pg == nil {
				return 0, nil, syserror.ESRCH
			}
			a.SetOwnerProcessGroup(t, pg)
			return 0, nil, nil
		default:
			return 0, nil, syserror.EINVAL
		}
	case linux.F_GET_SEALS:
		val, err := tmpfs.GetSeals(file.Dirent.Inode)
		return uintptr(val), nil, err
	case linux.F_ADD_SEALS:
		if !file.Flags().Write {
			return 0, nil, syserror.EPERM
		}
		err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint())
		return 0, nil, err
	case linux.F_GETPIPE_SZ:
		sz, ok := file.FileOperations.(fs.FifoSizer)
		if !ok {
			return 0, nil, syserror.EINVAL
		}
		size, err := sz.FifoSize(t, file)
		return uintptr(size), nil, err
	case linux.F_SETPIPE_SZ:
		sz, ok := file.FileOperations.(fs.FifoSizer)
		if !ok {
			return 0, nil, syserror.EINVAL
		}
		n, err := sz.SetFifoSize(int64(args[2].Int()))
		return uintptr(n), nil, err
	case linux.F_GETSIG:
		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
		return uintptr(a.Signal()), nil, nil
	case linux.F_SETSIG:
		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
		return 0, nil, a.SetSignal(linux.Signal(args[2].Int()))
	default:
		// Everything else is not yet supported.
		return 0, nil, syserror.EINVAL
	}
}

// Fadvise64 implements linux syscall fadvise64(2).
// This implementation currently ignores the provided advice.
func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	length := args[2].Int64()
	advice := args[3].Int()

	// Note: offset is allowed to be negative.
	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	// If the FD refers to a pipe or FIFO, return error.
	if fs.IsPipe(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.ESPIPE
	}

	switch advice {
	case linux.POSIX_FADV_NORMAL:
	case linux.POSIX_FADV_RANDOM:
	case linux.POSIX_FADV_SEQUENTIAL:
	case linux.POSIX_FADV_WILLNEED:
	case linux.POSIX_FADV_DONTNEED:
	case linux.POSIX_FADV_NOREUSE:
	default:
		return 0, nil, syserror.EINVAL
	}

	// Sure, whatever.
	return 0, nil, nil
}

// LINT.ThenChange(vfs2/fd.go)

// LINT.IfChange

func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Does this directory exist already?
		remainingTraversals := uint(linux.MaxSymlinkTraversals)
		f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals)
		switch err {
		case nil:
			// The directory existed.
			defer f.DecRef(t)
			return syserror.EEXIST
		case syserror.EACCES:
			// Permission denied while walking to the directory.
			return err
		default:
			// Do we have write permissions on the parent?
			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
				return err
			}

			// Create the directory.
			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
			return d.CreateDirectory(t, root, name, perms)
		}
	})
}

// Mkdir implements linux syscall mkdir(2).
func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	mode := linux.FileMode(args[1].ModeT())

	return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode)
}

// Mkdirat implements linux syscall mkdirat(2).
func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	addr := args[1].Pointer()
	mode := linux.FileMode(args[2].ModeT())

	return 0, nil, mkdirAt(t, dirFD, addr, mode)
}

func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	// Special case: removing the root always returns EBUSY.
	if path == "/" {
		return syserror.EBUSY
	}

	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Linux returns different ernos when the path ends in single
		// dot vs. double dots.
		switch name {
		case ".":
			return syserror.EINVAL
		case "..":
			return syserror.ENOTEMPTY
		}

		if err := d.MayDelete(t, root, name); err != nil {
			return err
		}

		return d.RemoveDirectory(t, root, name)
	})
}

// Rmdir implements linux syscall rmdir(2).
func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr)
}

func symlinkAt(t *kernel.Task, dirFD int32, newAddr usermem.Addr, oldAddr usermem.Addr) error {
	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
	if err != nil {
		return err
	}
	if dirPath {
		return syserror.ENOENT
	}

	// The oldPath is copied in verbatim. This is because the symlink
	// will include all details, including trailing slashes.
	oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX)
	if err != nil {
		return err
	}
	if oldPath == "" {
		return syserror.ENOENT
	}

	return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Make sure we have write permissions on the parent directory.
		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
			return err
		}
		return d.CreateLink(t, root, oldPath, name)
	})
}

// Symlink implements linux syscall symlink(2).
func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	newAddr := args[1].Pointer()

	return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr)
}

// Symlinkat implements linux syscall symlinkat(2).
func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	dirFD := args[1].Int()
	newAddr := args[2].Pointer()

	return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
}

// mayLinkAt determines whether t can create a hard link to target.
//
// This corresponds to Linux's fs/namei.c:may_linkat.
func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
	// Linux will impose the following restrictions on hard links only if
	// sysctl_protected_hardlinks is enabled. The kernel disables this
	// setting by default for backward compatibility (see commit
	// 561ec64ae67e), but also recommends that distributions enable it (and
	// Debian does:
	// https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098).
	//
	// gVisor currently behaves as though sysctl_protected_hardlinks is
	// always enabled, and thus imposes the following restrictions on hard
	// links.

	if target.CheckOwnership(t) {
		// fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER)
		// can hardlink all they like."
		return nil
	}

	// If we are not the owner, then the file must be regular and have
	// Read+Write permissions.
	if !fs.IsRegular(target.StableAttr) {
		return syserror.EPERM
	}
	if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil {
		return syserror.EPERM
	}

	return nil
}

// linkAt creates a hard link to the target specified by oldDirFD and oldAddr,
// specified by newDirFD and newAddr.  If resolve is true, then the symlinks
// will be followed when evaluating the target.
func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr, resolve, allowEmpty bool) error {
	oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
	if err != nil {
		return err
	}
	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
	if err != nil {
		return err
	}
	if dirPath {
		return syserror.ENOENT
	}

	if allowEmpty && oldPath == "" {
		target := t.GetFile(oldDirFD)
		if target == nil {
			return syserror.EBADF
		}
		defer target.DecRef(t)
		if err := mayLinkAt(t, target.Dirent.Inode); err != nil {
			return err
		}

		// Resolve the target directory.
		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
			if !fs.IsDir(newParent.Inode.StableAttr) {
				return syserror.ENOTDIR
			}

			// Make sure we have write permissions on the parent directory.
			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
				return err
			}
			return newParent.CreateHardLink(t, root, target.Dirent, newName)
		})
	}

	// Resolve oldDirFD and oldAddr to a dirent.  The "resolve" argument
	// only applies to this name.
	return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent, _ uint) error {
		if err := mayLinkAt(t, target.Inode); err != nil {
			return err
		}

		// Next resolve newDirFD and newAddr to the parent dirent and name.
		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
			if !fs.IsDir(newParent.Inode.StableAttr) {
				return syserror.ENOTDIR
			}

			// Make sure we have write permissions on the parent directory.
			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
				return err
			}
			return newParent.CreateHardLink(t, root, target, newName)
		})
	})
}

// Link implements linux syscall link(2).
func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	newAddr := args[1].Pointer()

	// man link(2):
	// POSIX.1-2001 says that link() should dereference oldpath if it is a
	// symbolic link. However, since kernel 2.0, Linux does not do so: if
	// oldpath is a symbolic link, then newpath is created as a (hard) link
	// to the same symbolic link file (i.e., newpath becomes a symbolic
	// link to the same file that oldpath refers to).
	resolve := false
	return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */)
}

// Linkat implements linux syscall linkat(2).
func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldDirFD := args[0].Int()
	oldAddr := args[1].Pointer()
	newDirFD := args[2].Int()
	newAddr := args[3].Pointer()

	// man linkat(2):
	// By default, linkat(), does not dereference oldpath if it is a
	// symbolic link (like link(2)). Since Linux 2.6.18, the flag
	// AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be
	// dereferenced if it is a symbolic link.
	flags := args[4].Int()

	// Sanity check flags.
	if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 {
		return 0, nil, syserror.EINVAL
	}

	resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW
	allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH

	if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) {
		return 0, nil, syserror.ENOENT
	}

	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
}

// LINT.ThenChange(vfs2/filesystem.go)

// LINT.IfChange

func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, err
	}
	if dirPath {
		return 0, syserror.ENOENT
	}

	err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		// Check for Read permission.
		if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil {
			return err
		}

		s, err := d.Inode.Readlink(t)
		if err == syserror.ENOLINK {
			return syserror.EINVAL
		}
		if err != nil {
			return err
		}

		buffer := []byte(s)
		if uint(len(buffer)) > size {
			buffer = buffer[:size]
		}

		n, err := t.CopyOutBytes(bufAddr, buffer)

		// Update frame return value.
		copied = uintptr(n)

		return err
	})
	return copied, err // Return frame value.
}

// Readlink implements linux syscall readlink(2).
func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	bufAddr := args[1].Pointer()
	size := args[2].SizeT()

	n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size)
	return n, nil, err
}

// Readlinkat implements linux syscall readlinkat(2).
func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	addr := args[1].Pointer()
	bufAddr := args[2].Pointer()
	size := args[3].SizeT()

	n, err := readlinkAt(t, dirFD, addr, bufAddr, size)
	return n, nil, err
}

// LINT.ThenChange(vfs2/stat.go)

// LINT.IfChange

func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		if err := d.MayDelete(t, root, name); err != nil {
			return err
		}

		return d.Remove(t, root, name, dirPath)
	})
}

// Unlink implements linux syscall unlink(2).
func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr)
}

// Unlinkat implements linux syscall unlinkat(2).
func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	addr := args[1].Pointer()
	flags := args[2].Uint()
	if flags&linux.AT_REMOVEDIR != 0 {
		return 0, nil, rmdirAt(t, dirFD, addr)
	}
	return 0, nil, unlinkAt(t, dirFD, addr)
}

// LINT.ThenChange(vfs2/filesystem.go)

// LINT.IfChange

// Truncate implements linux syscall truncate(2).
func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].Int64()

	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, nil, err
	}
	if dirPath {
		return 0, nil, syserror.EINVAL
	}

	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
		t.SendSignal(&arch.SignalInfo{
			Signo: int32(linux.SIGXFSZ),
			Code:  arch.SignalInfoUser,
		})
		return 0, nil, syserror.EFBIG
	}

	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		if fs.IsDir(d.Inode.StableAttr) {
			return syserror.EISDIR
		}
		// In contrast to open(O_TRUNC), truncate(2) is only valid for file
		// types.
		if !fs.IsFile(d.Inode.StableAttr) {
			return syserror.EINVAL
		}

		// Reject truncation if the access permissions do not allow truncation.
		// This is different from the behavior of sys_ftruncate, see below.
		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
			return err
		}

		if err := d.Inode.Truncate(t, d, length); err != nil {
			return err
		}

		// File length modified, generate notification.
		d.InotifyEvent(linux.IN_MODIFY, 0)

		return nil
	})
}

// Ftruncate implements linux syscall ftruncate(2).
func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	length := args[1].Int64()

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	// Reject truncation if the file flags do not permit this operation.
	// This is different from truncate(2) above.
	if !file.Flags().Write {
		return 0, nil, syserror.EINVAL
	}

	// In contrast to open(O_TRUNC), truncate(2) is only valid for file
	// types. Note that this is different from truncate(2) above, where a
	// directory returns EISDIR.
	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.EINVAL
	}

	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
		t.SendSignal(&arch.SignalInfo{
			Signo: int32(linux.SIGXFSZ),
			Code:  arch.SignalInfoUser,
		})
		return 0, nil, syserror.EFBIG
	}

	if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil {
		return 0, nil, err
	}

	// File length modified, generate notification.
	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)

	return 0, nil, nil
}

// LINT.ThenChange(vfs2/setstat.go)

// Umask implements linux syscall umask(2).
func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	mask := args[0].ModeT()
	mask = t.FSContext().SwapUmask(mask & 0777)
	return uintptr(mask), nil, nil
}

// LINT.IfChange

// Change ownership of a file.
//
// uid and gid may be -1, in which case they will not be changed.
func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
	owner := fs.FileOwner{
		UID: auth.NoID,
		GID: auth.NoID,
	}

	uattr, err := d.Inode.UnstableAttr(t)
	if err != nil {
		return err
	}
	c := t.Credentials()
	hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN)
	isOwner := uattr.Owner.UID == c.EffectiveKUID
	if uid.Ok() {
		kuid := c.UserNamespace.MapToKUID(uid)
		// Valid UID must be supplied if UID is to be changed.
		if !kuid.Ok() {
			return syserror.EINVAL
		}

		// "Only a privileged process (CAP_CHOWN) may change the owner
		// of a file." -chown(2)
		//
		// Linux also allows chown if you own the file and are
		// explicitly not changing its UID.
		isNoop := uattr.Owner.UID == kuid
		if !(hasCap || (isOwner && isNoop)) {
			return syserror.EPERM
		}

		owner.UID = kuid
	}
	if gid.Ok() {
		kgid := c.UserNamespace.MapToKGID(gid)
		// Valid GID must be supplied if GID is to be changed.
		if !kgid.Ok() {
			return syserror.EINVAL
		}

		// "The owner of a file may change the group of the file to any
		// group of which that owner is a member. A privileged process
		// (CAP_CHOWN) may change the group arbitrarily." -chown(2)
		isNoop := uattr.Owner.GID == kgid
		isMemberGroup := c.InGroup(kgid)
		if !(hasCap || (isOwner && (isNoop || isMemberGroup))) {
			return syserror.EPERM
		}

		owner.GID = kgid
	}

	// FIXME(b/62949101): This is racy; the inode's owner may have changed in
	// the meantime. (Linux holds i_mutex while calling
	// fs/attr.c:notify_change() => inode_operations::setattr =>
	// inode_change_ok().)
	if err := d.Inode.SetOwner(t, d, owner); err != nil {
		return err
	}

	// When the owner or group are changed by an unprivileged user,
	// chown(2) also clears the set-user-ID and set-group-ID bits, but
	// we do not support them.
	return nil
}

func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
	path, _, err := copyInPath(t, addr, allowEmpty)
	if err != nil {
		return err
	}

	if path == "" {
		// Annoying. What's wrong with fchown?
		file := t.GetFile(fd)
		if file == nil {
			return syserror.EBADF
		}
		defer file.DecRef(t)

		return chown(t, file.Dirent, uid, gid)
	}

	return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		return chown(t, d, uid, gid)
	})
}

// Chown implements linux syscall chown(2).
func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	uid := auth.UID(args[1].Uint())
	gid := auth.GID(args[2].Uint())

	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid)
}

// Lchown implements linux syscall lchown(2).
func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	uid := auth.UID(args[1].Uint())
	gid := auth.GID(args[2].Uint())

	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid)
}

// Fchown implements linux syscall fchown(2).
func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	uid := auth.UID(args[1].Uint())
	gid := auth.GID(args[2].Uint())

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	return 0, nil, chown(t, file.Dirent, uid, gid)
}

// Fchownat implements Linux syscall fchownat(2).
func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	addr := args[1].Pointer()
	uid := auth.UID(args[2].Uint())
	gid := auth.GID(args[3].Uint())
	flags := args[4].Int()

	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
		return 0, nil, syserror.EINVAL
	}

	return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid)
}

func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
	// Must own file to change mode.
	if !d.Inode.CheckOwnership(t) {
		return syserror.EPERM
	}

	p := fs.FilePermsFromMode(mode)
	if !d.Inode.SetPermissions(t, d, p) {
		return syserror.EPERM
	}

	// File attribute changed, generate notification.
	d.InotifyEvent(linux.IN_ATTRIB, 0)

	return nil
}

func chmodAt(t *kernel.Task, fd int32, addr usermem.Addr, mode linux.FileMode) error {
	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		return chmod(t, d, mode)
	})
}

// Chmod implements linux syscall chmod(2).
func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	mode := linux.FileMode(args[1].ModeT())

	return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode)
}

// Fchmod implements linux syscall fchmod(2).
func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	mode := linux.FileMode(args[1].ModeT())

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	return 0, nil, chmod(t, file.Dirent, mode)
}

// Fchmodat implements linux syscall fchmodat(2).
func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	addr := args[1].Pointer()
	mode := linux.FileMode(args[2].ModeT())

	return 0, nil, chmodAt(t, fd, addr, mode)
}

// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime
// to the system time.
func defaultSetToSystemTimeSpec() fs.TimeSpec {
	return fs.TimeSpec{
		ATimeSetSystemTime: true,
		MTimeSetSystemTime: true,
	}
}

func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
	setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
		// Does the task own the file?
		if !d.Inode.CheckOwnership(t) {
			// Trying to set a specific time? Must be owner.
			if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) {
				return syserror.EPERM
			}

			// Trying to set to current system time? Must have write access.
			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
				return err
			}
		}

		if err := d.Inode.SetTimestamps(t, d, ts); err != nil {
			return err
		}

		// File attribute changed, generate notification.
		d.InotifyEvent(linux.IN_ATTRIB, 0)
		return nil
	}

	// From utimes.c:
	// "If filename is NULL and dfd refers to an open file, then operate on
	// the file.  Otherwise look up filename, possibly using dfd as a
	// starting point."
	if addr == 0 && dirFD != linux.AT_FDCWD {
		if !resolve {
			// Linux returns EINVAL in this case. See utimes.c.
			return syserror.EINVAL
		}
		f := t.GetFile(dirFD)
		if f == nil {
			return syserror.EBADF
		}
		defer f.DecRef(t)

		root := t.FSContext().RootDirectory()
		defer root.DecRef(t)

		return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals)
	}

	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	return fileOpOn(t, dirFD, path, resolve, setTimestamp)
}

// Utime implements linux syscall utime(2).
func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	filenameAddr := args[0].Pointer()
	timesAddr := args[1].Pointer()

	// No timesAddr argument will be interpreted as current system time.
	ts := defaultSetToSystemTimeSpec()
	if timesAddr != 0 {
		var times linux.Utime
		if _, err := times.CopyIn(t, timesAddr); err != nil {
			return 0, nil, err
		}
		ts = fs.TimeSpec{
			ATime: ktime.FromSeconds(times.Actime),
			MTime: ktime.FromSeconds(times.Modtime),
		}
	}
	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
}

// Utimes implements linux syscall utimes(2).
func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	filenameAddr := args[0].Pointer()
	timesAddr := args[1].Pointer()

	// No timesAddr argument will be interpreted as current system time.
	ts := defaultSetToSystemTimeSpec()
	if timesAddr != 0 {
		var times [2]linux.Timeval
		if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
			return 0, nil, err
		}
		ts = fs.TimeSpec{
			ATime: ktime.FromTimeval(times[0]),
			MTime: ktime.FromTimeval(times[1]),
		}
	}
	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
}

// timespecIsValid checks that the timespec is valid for use in utimensat.
func timespecIsValid(ts linux.Timespec) bool {
	// Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9.
	return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9
}

// Utimensat implements linux syscall utimensat(2).
func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	pathnameAddr := args[1].Pointer()
	timesAddr := args[2].Pointer()
	flags := args[3].Int()

	// No timesAddr argument will be interpreted as current system time.
	ts := defaultSetToSystemTimeSpec()
	if timesAddr != 0 {
		var times [2]linux.Timespec
		if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
			return 0, nil, err
		}
		if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
			return 0, nil, syserror.EINVAL
		}

		// If both are UTIME_OMIT, this is a noop.
		if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT {
			return 0, nil, nil
		}

		ts = fs.TimeSpec{
			ATime:              ktime.FromTimespec(times[0]),
			ATimeOmit:          times[0].Nsec == linux.UTIME_OMIT,
			ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
			MTime:              ktime.FromTimespec(times[1]),
			MTimeOmit:          times[1].Nsec == linux.UTIME_OMIT,
			MTimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
		}
	}
	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0)
}

// Futimesat implements linux syscall futimesat(2).
func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	pathnameAddr := args[1].Pointer()
	timesAddr := args[2].Pointer()

	// No timesAddr argument will be interpreted as current system time.
	ts := defaultSetToSystemTimeSpec()
	if timesAddr != 0 {
		var times [2]linux.Timeval
		if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
			return 0, nil, err
		}
		if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
			times[1].Usec >= 1e6 || times[1].Usec < 0 {
			return 0, nil, syserror.EINVAL
		}

		ts = fs.TimeSpec{
			ATime: ktime.FromTimeval(times[0]),
			MTime: ktime.FromTimeval(times[1]),
		}
	}
	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
}

// LINT.ThenChange(vfs2/setstat.go)

// LINT.IfChange

func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
	if err != nil {
		return err
	}
	oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string, _ uint) error {
		if !fs.IsDir(oldParent.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Rename rejects paths that end in ".", "..", or empty (i.e.
		// the root) with EBUSY.
		switch oldName {
		case "", ".", "..":
			return syserror.EBUSY
		}

		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
			if !fs.IsDir(newParent.Inode.StableAttr) {
				return syserror.ENOTDIR
			}

			// Rename rejects paths that end in ".", "..", or empty
			// (i.e.  the root) with EBUSY.
			switch newName {
			case "", ".", "..":
				return syserror.EBUSY
			}

			return fs.Rename(t, root, oldParent, oldName, newParent, newName)
		})
	})
}

// Rename implements linux syscall rename(2).
func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldPathAddr := args[0].Pointer()
	newPathAddr := args[1].Pointer()
	return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr)
}

// Renameat implements linux syscall renameat(2).
func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldDirFD := args[0].Int()
	oldPathAddr := args[1].Pointer()
	newDirFD := args[2].Int()
	newPathAddr := args[3].Pointer()
	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
}

// LINT.ThenChange(vfs2/filesystem.go)

// Fallocate implements linux system call fallocate(2).
func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	mode := args[1].Int64()
	offset := args[2].Int64()
	length := args[3].Int64()

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	if offset < 0 || length <= 0 {
		return 0, nil, syserror.EINVAL
	}
	if mode != 0 {
		t.Kernel().EmitUnimplementedEvent(t)
		return 0, nil, syserror.ENOTSUP
	}
	if !file.Flags().Write {
		return 0, nil, syserror.EBADF
	}
	if fs.IsPipe(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.ESPIPE
	}
	if fs.IsDir(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.EISDIR
	}
	if !fs.IsRegular(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.ENODEV
	}
	size := offset + length
	if size < 0 {
		return 0, nil, syserror.EFBIG
	}
	if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
		t.SendSignal(&arch.SignalInfo{
			Signo: int32(linux.SIGXFSZ),
			Code:  arch.SignalInfoUser,
		})
		return 0, nil, syserror.EFBIG
	}

	if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil {
		return 0, nil, err
	}

	// File length modified, generate notification.
	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)

	return 0, nil, nil
}

// Flock implements linux syscall flock(2).
func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	operation := args[1].Int()

	file := t.GetFile(fd)
	if file == nil {
		// flock(2): EBADF fd is not an open file descriptor.
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef(t)

	nonblocking := operation&linux.LOCK_NB != 0
	operation &^= linux.LOCK_NB

	// A BSD style lock spans the entire file.
	rng := lock.LockRange{
		Start: 0,
		End:   lock.LockEOF,
	}

	switch operation {
	case linux.LOCK_EX:
		if nonblocking {
			// Since we're nonblocking we pass a nil lock.Blocker implementation.
			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) {
				return 0, nil, syserror.EWOULDBLOCK
			}
		} else {
			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) {
				return 0, nil, syserror.EINTR
			}
		}
	case linux.LOCK_SH:
		if nonblocking {
			// Since we're nonblocking we pass a nil lock.Blocker implementation.
			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) {
				return 0, nil, syserror.EWOULDBLOCK
			}
		} else {
			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) {
				return 0, nil, syserror.EINTR
			}
		}
	case linux.LOCK_UN:
		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
	default:
		// flock(2): EINVAL operation is invalid.
		return 0, nil, syserror.EINVAL
	}

	return 0, nil, nil
}

const (
	memfdPrefix     = "/memfd:"
	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1
)

// MemfdCreate implements the linux syscall memfd_create(2).
func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	flags := args[1].Uint()

	if flags&^memfdAllFlags != 0 {
		// Unknown bits in flags.
		return 0, nil, syserror.EINVAL
	}

	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
	cloExec := flags&linux.MFD_CLOEXEC != 0

	name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix))
	if err != nil {
		return 0, nil, err
	}
	if len(name) > memfdMaxNameLen {
		return 0, nil, syserror.EINVAL
	}
	name = memfdPrefix + name

	inode := tmpfs.NewMemfdInode(t, allowSeals)
	dirent := fs.NewDirent(t, inode, name)
	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
	// FMODE_READ | FMODE_WRITE.
	file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true})
	if err != nil {
		return 0, nil, err
	}

	defer dirent.DecRef(t)
	defer file.DecRef(t)

	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
		CloseOnExec: cloExec,
	})
	if err != nil {
		return 0, nil, err
	}

	return uintptr(newFD), nil, nil
}