1 files changed, 2088 insertions, 0 deletions
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
new file mode 100644
index 000000000..19f579930
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -0,0 +1,2088 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// fileOpAt performs an operation on the second last component in the path.
+func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string) error) error {
+	// Extract the last component.
+	dir, name := fs.SplitLast(path)
+	if dir == "/" {
+		// Common case: we are accessing a file in the root.
+		root := t.FSContext().RootDirectory()
+		err := fn(root, root, name)
+		root.DecRef()
+		return err
+	} else if dir == "." && dirFD == linux.AT_FDCWD {
+		// Common case: we are accessing a file relative to the current
+		// working directory; skip the look-up.
+		wd := t.FSContext().WorkingDirectory()
+		root := t.FSContext().RootDirectory()
+		err := fn(root, wd, name)
+		wd.DecRef()
+		root.DecRef()
+		return err
+	}
+
+	return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		return fn(root, d, name)
+	})
+}
+
+// fileOpOn performs an operation on the last entry of the path.
+func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	var (
+		d   *fs.Dirent // The file.
+		wd  *fs.Dirent // The working directory (if required.)
+		rel *fs.Dirent // The relative directory for search (if required.)
+		f   *fs.File   // The file corresponding to dirFD (if required.)
+		err error
+	)
+
+	// Extract the working directory (maybe).
+	if len(path) > 0 && path[0] == '/' {
+		// Absolute path; rel can be nil.
+	} else if dirFD == linux.AT_FDCWD {
+		// Need to reference the working directory.
+		wd = t.FSContext().WorkingDirectory()
+		rel = wd
+	} else {
+		// Need to extract the given FD.
+		f = t.FDMap().GetFile(dirFD)
+		if f == nil {
+			return syserror.EBADF
+		}
+		rel = f.Dirent
+		if !fs.IsDir(rel.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+	}
+
+	// Grab the root (always required.)
+	root := t.FSContext().RootDirectory()
+
+	// Lookup the node.
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	if resolve {
+		d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals)
+	} else {
+		d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals)
+	}
+	root.DecRef()
+	if wd != nil {
+		wd.DecRef()
+	}
+	if f != nil {
+		f.DecRef()
+	}
+	if err != nil {
+		return err
+	}
+
+	err = fn(root, d)
+	d.DecRef()
+	return err
+}
+
+// copyInPath copies a path in.
+func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) {
+	path, err = t.CopyInString(addr, linux.PATH_MAX)
+	if err != nil {
+		return "", false, err
+	}
+	if path == "" && !allowEmpty {
+		return "", false, syserror.ENOENT
+	}
+
+	// If the path ends with a /, then checks must be enforced in various
+	// ways in the different callers. We pass this back to the caller.
+	path, dirPath = fs.TrimTrailingSlashes(path)
+
+	return path, dirPath, nil
+}
+
+func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+
+	resolve := flags&linux.O_NOFOLLOW == 0
+	err = fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error {
+		// First check a few things about the filesystem before trying to get the file
+		// reference.
+		//
+		// It's required that Check does not try to open files not that aren't backed by
+		// this dirent (e.g. pipes and sockets) because this would result in opening these
+		// files an extra time just to check permissions.
+		if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+			return err
+		}
+
+		if fs.IsSymlink(d.Inode.StableAttr) && !resolve {
+			return syserror.ELOOP
+		}
+
+		fileFlags := linuxToFlags(flags)
+		// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
+		fileFlags.LargeFile = true
+		if fs.IsDir(d.Inode.StableAttr) {
+			// Don't allow directories to be opened writable.
+			if fileFlags.Write {
+				return syserror.EISDIR
+			}
+		} else {
+			// If O_DIRECTORY is set, but the file is not a directory, then fail.
+			if fileFlags.Directory {
+				return syserror.ENOTDIR
+			}
+			// If it's a directory, then make sure.
+			if dirPath {
+				return syserror.ENOTDIR
+			}
+			if flags&linux.O_TRUNC != 0 {
+				if err := d.Inode.Truncate(t, d, 0); err != nil {
+					return err
+				}
+			}
+		}
+
+		file, err := d.Inode.GetFile(t, d, fileFlags)
+		if err != nil {
+			return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+		}
+		defer file.DecRef()
+
+		// Success.
+		fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
+		newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+		if err != nil {
+			return err
+		}
+
+		// Set return result in frame.
+		fd = uintptr(newFD)
+
+		// Generate notification for opened file.
+		d.InotifyEvent(linux.IN_OPEN, 0)
+
+		return nil
+	})
+	return fd, err // Use result in frame.
+}
+
+func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if dirPath {
+		return syserror.ENOENT
+	}
+
+	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Do we have the appropriate permissions on the parent?
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+			return err
+		}
+
+		// Attempt a creation.
+		perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+
+		switch mode.FileType() {
+		case 0:
+			// "Zero file type is equivalent to type S_IFREG." - mknod(2)
+			fallthrough
+		case linux.ModeRegular:
+			// We are not going to return the file, so the actual
+			// flags used don't matter, but they cannot be empty or
+			// Create will complain.
+			flags := fs.FileFlags{Read: true, Write: true}
+			file, err := d.Create(t, root, name, flags, perms)
+			if err != nil {
+				return err
+			}
+			file.DecRef()
+			return nil
+
+		case linux.ModeNamedPipe:
+			return d.CreateFifo(t, root, name, perms)
+
+		case linux.ModeSocket:
+			// While it is possible create a unix domain socket file on linux
+			// using mknod(2), in practice this is pretty useless from an
+			// application. Linux internally uses mknod() to create the socket
+			// node during bind(2), but we implement bind(2) independently. If
+			// an application explicitly creates a socket node using mknod(),
+			// you can't seem to bind() or connect() to the resulting socket.
+			//
+			// Instead of emulating this seemingly useless behaviour, we'll
+			// indicate that the filesystem doesn't support the creation of
+			// sockets.
+			return syserror.EOPNOTSUPP
+
+		case linux.ModeCharacterDevice:
+			fallthrough
+		case linux.ModeBlockDevice:
+			// TODO(b/72101894): We don't support creating block or character
+			// devices at the moment.
+			//
+			// When we start supporting block and character devices, we'll
+			// need to check for CAP_MKNOD here.
+			return syserror.EPERM
+
+		default:
+			// "EINVAL - mode requested creation of something other than a
+			// regular file, device special file, FIFO or socket." - mknod(2)
+			return syserror.EINVAL
+		}
+	})
+}
+
+// Mknod implements the linux syscall mknod(2).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	path := args[0].Pointer()
+	mode := linux.FileMode(args[1].ModeT())
+	// We don't need this argument until we support creation of device nodes.
+	_ = args[2].Uint() // dev
+
+	return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode)
+}
+
+// Mknodat implements the linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	path := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+	// We don't need this argument until we support creation of device nodes.
+	_ = args[3].Uint() // dev
+
+	return 0, nil, mknodAt(t, dirFD, path, mode)
+}
+
+func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+	if dirPath {
+		return 0, syserror.ENOENT
+	}
+
+	err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		fileFlags := linuxToFlags(flags)
+		// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
+		fileFlags.LargeFile = true
+
+		// Does this file exist already?
+		remainingTraversals := uint(linux.MaxSymlinkTraversals)
+		targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals)
+		var newFile *fs.File
+		switch err {
+		case nil:
+			// The file existed.
+			defer targetDirent.DecRef()
+
+			// Check if we wanted to create.
+			if flags&linux.O_EXCL != 0 {
+				return syserror.EEXIST
+			}
+
+			// Like sys_open, check for a few things about the
+			// filesystem before trying to get a reference to the
+			// fs.File. The same constraints on Check apply.
+			if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+				return err
+			}
+
+			// Should we truncate the file?
+			if flags&linux.O_TRUNC != 0 {
+				if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil {
+					return err
+				}
+			}
+
+			// Create a new fs.File.
+			newFile, err = targetDirent.Inode.GetFile(t, targetDirent, fileFlags)
+			if err != nil {
+				return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+			}
+			defer newFile.DecRef()
+		case syserror.ENOENT:
+			// File does not exist. Proceed with creation.
+
+			// Do we have write permissions on the parent?
+			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+
+			// Attempt a creation.
+			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+			newFile, err = d.Create(t, root, name, fileFlags, perms)
+			if err != nil {
+				// No luck, bail.
+				return err
+			}
+			defer newFile.DecRef()
+			targetDirent = newFile.Dirent
+		default:
+			return err
+		}
+
+		// Success.
+		fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
+		newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits())
+		if err != nil {
+			return err
+		}
+
+		// Set result in frame.
+		fd = uintptr(newFD)
+
+		// Queue the open inotify event. The creation event is
+		// automatically queued when the dirent is targetDirent. The
+		// open events are implemented at the syscall layer so we need
+		// to manually queue one here.
+		targetDirent.InotifyEvent(linux.IN_OPEN, 0)
+
+		return nil
+	})
+	return fd, err // Use result in frame.
+}
+
+// Open implements linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := uint(args[1].Uint())
+	if flags&linux.O_CREAT != 0 {
+		mode := linux.FileMode(args[2].ModeT())
+		n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode)
+		return n, nil, err
+	}
+	n, err := openAt(t, linux.AT_FDCWD, addr, flags)
+	return n, nil, err
+}
+
+// Openat implements linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	flags := uint(args[2].Uint())
+	if flags&linux.O_CREAT != 0 {
+		mode := linux.FileMode(args[3].ModeT())
+		n, err := createAt(t, dirFD, addr, flags, mode)
+		return n, nil, err
+	}
+	n, err := openAt(t, dirFD, addr, flags)
+	return n, nil, err
+}
+
+// Creat implements linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := linux.FileMode(args[1].ModeT())
+	n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode)
+	return n, nil, err
+}
+
+// accessContext is a context that overrides the credentials used, but
+// otherwise carries the same values as the embedded context.
+//
+// accessContext should only be used for access(2).
+type accessContext struct {
+	context.Context
+	creds *auth.Credentials
+}
+
+// Value implements context.Context.
+func (ac accessContext) Value(key interface{}) interface{} {
+	switch key {
+	case auth.CtxCredentials:
+		return ac.creds
+	default:
+		return ac.Context.Value(key)
+	}
+}
+
+func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error {
+	const rOK = 4
+	const wOK = 2
+	const xOK = 1
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// Sanity check the mode.
+	if mode&^(rOK|wOK|xOK) != 0 {
+		return syserror.EINVAL
+	}
+
+	return fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error {
+		// access(2) and faccessat(2) check permissions using real
+		// UID/GID, not effective UID/GID.
+		//
+		// "access() needs to use the real uid/gid, not the effective
+		// uid/gid. We do this by temporarily clearing all FS-related
+		// capabilities and switching the fsuid/fsgid around to the
+		// real ones." -fs/open.c:faccessat
+		creds := t.Credentials().Fork()
+		creds.EffectiveKUID = creds.RealKUID
+		creds.EffectiveKGID = creds.RealKGID
+		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
+			creds.EffectiveCaps = creds.PermittedCaps
+		} else {
+			creds.EffectiveCaps = 0
+		}
+
+		ctx := &accessContext{
+			Context: t,
+			creds:   creds,
+		}
+
+		return d.Inode.CheckPermission(ctx, fs.PermMask{
+			Read:    mode&rOK != 0,
+			Write:   mode&wOK != 0,
+			Execute: mode&xOK != 0,
+		})
+	})
+}
+
+// Access implements linux syscall access(2).
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+
+	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, true, mode)
+}
+
+// Faccessat implements linux syscall faccessat(2).
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	flags := args[3].Int()
+
+	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
+}
+
+// Ioctl implements linux syscall ioctl(2).
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	request := int(args[1].Int())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Shared flags between file and socket.
+	switch request {
+	case linux.FIONCLEX:
+		t.FDMap().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: false,
+		})
+		return 0, nil, nil
+	case linux.FIOCLEX:
+		t.FDMap().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: true,
+		})
+		return 0, nil, nil
+
+	case linux.FIONBIO:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.Flags()
+		if set != 0 {
+			flags.NonBlocking = true
+		} else {
+			flags.NonBlocking = false
+		}
+		file.SetFlags(flags.Settable())
+		return 0, nil, nil
+
+	case linux.FIOASYNC:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		flags := file.Flags()
+		if set != 0 {
+			flags.Async = true
+		} else {
+			flags.Async = false
+		}
+		file.SetFlags(flags.Settable())
+		return 0, nil, nil
+
+	case linux.FIOSETOWN, linux.SIOCSPGRP:
+		var set int32
+		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+			return 0, nil, err
+		}
+		fSetOwn(t, file, set)
+		return 0, nil, nil
+
+	case linux.FIOGETOWN, linux.SIOCGPGRP:
+		who := fGetOwn(t, file)
+		_, err := t.CopyOut(args[2].Pointer(), &who)
+		return 0, nil, err
+
+	default:
+		ret, err := file.FileOperations.Ioctl(t, t.MemoryManager(), args)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		return ret, nil, nil
+	}
+}
+
+// Getcwd implements the linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+	cwd := t.FSContext().WorkingDirectory()
+	defer cwd.DecRef()
+	root := t.FSContext().RootDirectory()
+	defer root.DecRef()
+
+	// Get our fullname from the root and preprend unreachable if the root was
+	// unreachable from our current dirent this is the same behavior as on linux.
+	s, reachable := cwd.FullName(root)
+	if !reachable {
+		s = "(unreachable)" + s
+	}
+
+	// Note this is >= because we need a terminator.
+	if uint(len(s)) >= size {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Copy out the path name for the node.
+	bytes, err := t.CopyOutBytes(addr, []byte(s))
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Top it off with a terminator.
+	_, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00"))
+	return uintptr(bytes + 1), nil, err
+}
+
+// Chroot implements the linux syscall chroot(2).
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+		return 0, nil, syserror.EPERM
+	}
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		// Is it a directory?
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Does it have execute permissions?
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+			return err
+		}
+
+		t.FSContext().SetRootDirectory(d)
+		return nil
+	})
+}
+
+// Chdir implements the linux syscall chdir(2).
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		// Is it a directory?
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Does it have execute permissions?
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+			return err
+		}
+
+		t.FSContext().SetWorkingDirectory(d)
+		return nil
+	})
+}
+
+// Fchdir implements the linux syscall fchdir(2).
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Is it a directory?
+	if !fs.IsDir(file.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.ENOTDIR
+	}
+
+	// Does it have execute permissions?
+	if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
+		return 0, nil, err
+	}
+
+	t.FSContext().SetWorkingDirectory(file.Dirent)
+	return 0, nil, nil
+}
+
+// Close implements linux syscall close(2).
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+
+	file, ok := t.FDMap().Remove(fd)
+	if !ok {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := file.Flush(t)
+	return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file)
+}
+
+// Dup implements linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, syserror.EMFILE
+	}
+	return uintptr(newfd), nil, nil
+}
+
+// Dup2 implements linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := kdefs.FD(args[0].Int())
+	newfd := kdefs.FD(args[1].Int())
+
+	// If oldfd is a valid file descriptor, and newfd has the same value as oldfd,
+	// then dup2() does nothing, and returns newfd.
+	if oldfd == newfd {
+		oldFile := t.FDMap().GetFile(oldfd)
+		if oldFile == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer oldFile.DecRef()
+
+		return uintptr(newfd), nil, nil
+	}
+
+	// Zero out flags arg to be used by Dup3.
+	args[2].Value = 0
+	return Dup3(t, args)
+}
+
+// Dup3 implements linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := kdefs.FD(args[0].Int())
+	newfd := kdefs.FD(args[1].Int())
+	flags := args[2].Uint()
+
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	oldFile := t.FDMap().GetFile(oldfd)
+	if oldFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer oldFile.DecRef()
+
+	err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(newfd), nil, nil
+}
+
+func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+	ma := file.Async(nil)
+	if ma == nil {
+		return 0
+	}
+	a := ma.(*fasync.FileAsync)
+	ot, otg, opg := a.Owner()
+	switch {
+	case ot != nil:
+		return int32(t.PIDNamespace().IDOfTask(ot))
+	case otg != nil:
+		return int32(t.PIDNamespace().IDOfThreadGroup(otg))
+	case opg != nil:
+		return int32(-t.PIDNamespace().IDOfProcessGroup(opg))
+	default:
+		return 0
+	}
+}
+
+// fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
+//
+// If who is positive, it represents a PID. If negative, it represents a PGID.
+// If the PID or PGID is invalid, the owner is silently unset.
+func fSetOwn(t *kernel.Task, file *fs.File, who int32) {
+	a := file.Async(fasync.New).(*fasync.FileAsync)
+	if who < 0 {
+		pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who))
+		a.SetOwnerProcessGroup(t, pg)
+	}
+	tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
+	a.SetOwnerThreadGroup(t, tg)
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	cmd := args[1].Int()
+
+	file, flags := t.FDMap().GetDescriptor(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+		from := kdefs.FD(args[2].Int())
+		fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC}
+		fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits())
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case linux.F_GETFD:
+		return uintptr(flags.ToLinuxFDFlags()), nil, nil
+	case linux.F_SETFD:
+		flags := args[2].Uint()
+		t.FDMap().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+		})
+	case linux.F_GETFL:
+		return uintptr(file.Flags().ToLinux()), nil, nil
+	case linux.F_SETFL:
+		flags := uint(args[2].Uint())
+		file.SetFlags(linuxToFlags(flags).Settable())
+	case linux.F_SETLK, linux.F_SETLKW:
+		// In Linux the file system can choose to provide lock operations for an inode.
+		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
+		// hammer by only allowing locks on files and directories.
+		if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) {
+			return 0, nil, syserror.EBADF
+		}
+
+		// Copy in the lock request.
+		flockAddr := args[2].Pointer()
+		var flock syscall.Flock_t
+		if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+			return 0, nil, err
+		}
+
+		// Compute the lock whence.
+		var sw fs.SeekWhence
+		switch flock.Whence {
+		case 0:
+			sw = fs.SeekSet
+		case 1:
+			sw = fs.SeekCurrent
+		case 2:
+			sw = fs.SeekEnd
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Compute the lock offset.
+		var off int64
+		switch sw {
+		case fs.SeekSet:
+			off = 0
+		case fs.SeekCurrent:
+			// Note that Linux does not hold any mutexes while retrieving the file offset,
+			// see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+			off = file.Offset()
+		case fs.SeekEnd:
+			uattr, err := file.Dirent.Inode.UnstableAttr(t)
+			if err != nil {
+				return 0, nil, err
+			}
+			off = uattr.Size
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+
+		// Compute the lock range.
+		rng, err := lock.ComputeRange(flock.Start, flock.Len, off)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		// The lock uid is that of the Task's FDMap.
+		lockUniqueID := lock.UniqueID(t.FDMap().ID())
+
+		// These locks don't block; execute the non-blocking operation using the inode's lock
+		// context directly.
+		switch flock.Type {
+		case syscall.F_RDLCK:
+			if !file.Flags().Read {
+				return 0, nil, syserror.EBADF
+			}
+			if cmd == syscall.F_SETLK {
+				// Non-blocking lock, provide a nil lock.Blocker.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+					return 0, nil, syserror.EAGAIN
+				}
+			} else {
+				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+					return 0, nil, syserror.EINTR
+				}
+			}
+			return 0, nil, nil
+		case syscall.F_WRLCK:
+			if !file.Flags().Write {
+				return 0, nil, syserror.EBADF
+			}
+			if cmd == syscall.F_SETLK {
+				// Non-blocking lock, provide a nil lock.Blocker.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+					return 0, nil, syserror.EAGAIN
+				}
+			} else {
+				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+					return 0, nil, syserror.EINTR
+				}
+			}
+			return 0, nil, nil
+		case syscall.F_UNLCK:
+			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng)
+			return 0, nil, nil
+		default:
+			return 0, nil, syserror.EINVAL
+		}
+	case linux.F_GETOWN:
+		return uintptr(fGetOwn(t, file)), nil, nil
+	case linux.F_SETOWN:
+		fSetOwn(t, file, args[2].Int())
+		return 0, nil, nil
+	case linux.F_GET_SEALS:
+		val, err := tmpfs.GetSeals(file.Dirent.Inode)
+		return uintptr(val), nil, err
+	case linux.F_ADD_SEALS:
+		if !file.Flags().Write {
+			return 0, nil, syserror.EPERM
+		}
+		err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint())
+		return 0, nil, err
+	case linux.F_GETPIPE_SZ:
+		sz, ok := file.FileOperations.(pipe.Sizer)
+		if !ok {
+			return 0, nil, syserror.EINVAL
+		}
+		return uintptr(sz.PipeSize()), nil, nil
+	case linux.F_SETPIPE_SZ:
+		sz, ok := file.FileOperations.(pipe.Sizer)
+		if !ok {
+			return 0, nil, syserror.EINVAL
+		}
+		n, err := sz.SetPipeSize(int64(args[2].Int()))
+		return uintptr(n), nil, err
+	default:
+		// Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+	return 0, nil, nil
+}
+
+const (
+	_FADV_NORMAL     = 0
+	_FADV_RANDOM     = 1
+	_FADV_SEQUENTIAL = 2
+	_FADV_WILLNEED   = 3
+	_FADV_DONTNEED   = 4
+	_FADV_NOREUSE    = 5
+)
+
+// Fadvise64 implements linux syscall fadvise64(2).
+// This implementation currently ignores the provided advice.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	length := args[2].Int64()
+	advice := args[3].Int()
+
+	// Note: offset is allowed to be negative.
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// If the FD refers to a pipe or FIFO, return error.
+	if fs.IsPipe(file.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.ESPIPE
+	}
+
+	switch advice {
+	case _FADV_NORMAL:
+	case _FADV_RANDOM:
+	case _FADV_SEQUENTIAL:
+	case _FADV_WILLNEED:
+	case _FADV_DONTNEED:
+	case _FADV_NOREUSE:
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Sure, whatever.
+	return 0, nil, nil
+}
+
+func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Does this directory exist already?
+		remainingTraversals := uint(linux.MaxSymlinkTraversals)
+		f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals)
+		switch err {
+		case nil:
+			// The directory existed.
+			defer f.DecRef()
+			return syserror.EEXIST
+		case syserror.EACCES:
+			// Permission denied while walking to the directory.
+			return err
+		default:
+			// Do we have write permissions on the parent?
+			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+
+			// Create the directory.
+			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
+			return d.CreateDirectory(t, root, name, perms)
+		}
+	})
+}
+
+// Mkdir implements linux syscall mkdir(2).
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := linux.FileMode(args[1].ModeT())
+
+	return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Mkdirat implements linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+
+	return 0, nil, mkdirAt(t, dirFD, addr, mode)
+}
+
+func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	// Special case: removing the root always returns EBUSY.
+	if path == "/" {
+		return syserror.EBUSY
+	}
+
+	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Linux returns different ernos when the path ends in single
+		// dot vs. double dots.
+		switch name {
+		case ".":
+			return syserror.EINVAL
+		case "..":
+			return syserror.ENOTEMPTY
+		}
+
+		if err := fs.MayDelete(t, root, d, name); err != nil {
+			return err
+		}
+
+		return d.RemoveDirectory(t, root, name)
+	})
+}
+
+// Rmdir implements linux syscall rmdir(2).
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr)
+}
+
+func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error {
+	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if dirPath {
+		return syserror.ENOENT
+	}
+
+	// The oldPath is copied in verbatim. This is because the symlink
+	// will include all details, including trailing slashes.
+	oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX)
+	if err != nil {
+		return err
+	}
+	if oldPath == "" {
+		return syserror.ENOENT
+	}
+
+	return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Make sure we have write permissions on the parent directory.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+			return err
+		}
+		return d.CreateLink(t, root, oldPath, name)
+	})
+}
+
+// Symlink implements linux syscall symlink(2).
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	newAddr := args[1].Pointer()
+
+	return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr)
+}
+
+// Symlinkat implements linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	dirFD := kdefs.FD(args[1].Int())
+	newAddr := args[2].Pointer()
+
+	return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
+}
+
+// mayLinkAt determines whether t can create a hard link to target.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
+	// Linux will impose the following restrictions on hard links only if
+	// sysctl_protected_hardlinks is enabled. The kernel disables this
+	// setting by default for backward compatibility (see commit
+	// 561ec64ae67e), but also recommends that distributions enable it (and
+	// Debian does:
+	// https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098).
+	//
+	// gVisor currently behaves as though sysctl_protected_hardlinks is
+	// always enabled, and thus imposes the following restrictions on hard
+	// links.
+
+	if target.CheckOwnership(t) {
+		// fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER)
+		// can hardlink all they like."
+		return nil
+	}
+
+	// If we are not the owner, then the file must be regular and have
+	// Read+Write permissions.
+	if !fs.IsRegular(target.StableAttr) {
+		return syserror.EPERM
+	}
+	if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil {
+		return syserror.EPERM
+	}
+
+	return nil
+}
+
+// linkAt creates a hard link to the target specified by oldDirFD and oldAddr,
+// specified by newDirFD and newAddr.  If resolve is true, then the symlinks
+// will be followed when evaluating the target.
+func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error {
+	oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
+	if err != nil {
+		return err
+	}
+	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if dirPath {
+		return syserror.ENOENT
+	}
+
+	if allowEmpty && oldPath == "" {
+		target := t.FDMap().GetFile(oldDirFD)
+		if target == nil {
+			return syserror.EBADF
+		}
+		defer target.DecRef()
+		if err := mayLinkAt(t, target.Dirent.Inode); err != nil {
+			return err
+		}
+
+		// Resolve the target directory.
+		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error {
+			if !fs.IsDir(newParent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+
+			// Make sure we have write permissions on the parent directory.
+			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+			return newParent.CreateHardLink(t, root, target.Dirent, newName)
+		})
+	}
+
+	// Resolve oldDirFD and oldAddr to a dirent.  The "resolve" argument
+	// only applies to this name.
+	return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent) error {
+		if err := mayLinkAt(t, target.Inode); err != nil {
+			return err
+		}
+
+		// Next resolve newDirFD and newAddr to the parent dirent and name.
+		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error {
+			if !fs.IsDir(newParent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+
+			// Make sure we have write permissions on the parent directory.
+			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+				return err
+			}
+			return newParent.CreateHardLink(t, root, target, newName)
+		})
+	})
+}
+
+// Link implements linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldAddr := args[0].Pointer()
+	newAddr := args[1].Pointer()
+
+	// man link(2):
+	// POSIX.1-2001 says that link() should dereference oldpath if it is a
+	// symbolic link. However, since kernel 2.0, Linux does not do so: if
+	// oldpath is a symbolic link, then newpath is created as a (hard) link
+	// to the same symbolic link file (i.e., newpath becomes a symbolic
+	// link to the same file that oldpath refers to).
+	resolve := false
+	return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */)
+}
+
+// Linkat implements linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldDirFD := kdefs.FD(args[0].Int())
+	oldAddr := args[1].Pointer()
+	newDirFD := kdefs.FD(args[2].Int())
+	newAddr := args[3].Pointer()
+
+	// man linkat(2):
+	// By default, linkat(), does not dereference oldpath if it is a
+	// symbolic link (like link(2)). Since Linux 2.6.18, the flag
+	// AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be
+	// dereferenced if it is a symbolic link.
+	flags := args[4].Int()
+
+	// Sanity check flags.
+	if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW
+	allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH
+
+	if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) {
+		return 0, nil, syserror.ENOENT
+	}
+
+	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
+}
+
+func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, err
+	}
+	if dirPath {
+		return 0, syserror.ENOENT
+	}
+
+	err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		// Check for Read permission.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil {
+			return err
+		}
+
+		s, err := d.Inode.Readlink(t)
+		if err == syserror.ENOLINK {
+			return syserror.EINVAL
+		}
+		if err != nil {
+			return err
+		}
+
+		buffer := []byte(s)
+		if uint(len(buffer)) > size {
+			buffer = buffer[:size]
+		}
+
+		n, err := t.CopyOutBytes(bufAddr, buffer)
+
+		// Update frame return value.
+		copied = uintptr(n)
+
+		return err
+	})
+	return copied, err // Return frame value.
+}
+
+// Readlink implements linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size)
+	return n, nil, err
+}
+
+// Readlinkat implements linux syscall readlinkat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	n, err := readlinkAt(t, dirFD, addr, bufAddr, size)
+	return n, nil, err
+}
+
+func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	if dirPath {
+		return syserror.ENOENT
+	}
+
+	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error {
+		if !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		if err := fs.MayDelete(t, root, d, name); err != nil {
+			return err
+		}
+
+		return d.Remove(t, root, name)
+	})
+}
+
+// Unlink implements linux syscall unlink(2).
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr)
+}
+
+// Unlinkat implements linux syscall unlinkat(2).
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	flags := args[2].Uint()
+	if flags&linux.AT_REMOVEDIR != 0 {
+		return 0, nil, rmdirAt(t, dirFD, addr)
+	}
+	return 0, nil, unlinkAt(t, dirFD, addr)
+}
+
+// Truncate implements linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+	if dirPath {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(syscall.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		if fs.IsDir(d.Inode.StableAttr) {
+			return syserror.EISDIR
+		}
+		if !fs.IsFile(d.Inode.StableAttr) {
+			return syserror.EINVAL
+		}
+
+		// Reject truncation if the access permissions do not allow truncation.
+		// This is different from the behavior of sys_ftruncate, see below.
+		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+			return err
+		}
+
+		if err := d.Inode.Truncate(t, d, length); err != nil {
+			return err
+		}
+
+		// File length modified, generate notification.
+		d.InotifyEvent(linux.IN_MODIFY, 0)
+
+		return nil
+	})
+}
+
+// Ftruncate implements linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	length := args[1].Int64()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Reject truncation if the file flags do not permit this operation.
+	// This is different from truncate(2) above.
+	if !file.Flags().Write {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Note that this is different from truncate(2) above, where a
+	// directory returns EISDIR.
+	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(syscall.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil {
+		return 0, nil, err
+	}
+
+	// File length modified, generate notification.
+	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+
+	return 0, nil, nil
+}
+
+// Umask implements linux syscall umask(2).
+func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	mask := args[0].ModeT()
+	mask = t.FSContext().SwapUmask(mask & 0777)
+	return uintptr(mask), nil, nil
+}
+
+// Change ownership of a file.
+//
+// uid and gid may be -1, in which case they will not be changed.
+func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
+	owner := fs.FileOwner{
+		UID: auth.NoID,
+		GID: auth.NoID,
+	}
+
+	uattr, err := d.Inode.UnstableAttr(t)
+	if err != nil {
+		return err
+	}
+	c := t.Credentials()
+	hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN)
+	isOwner := uattr.Owner.UID == c.EffectiveKUID
+	if uid.Ok() {
+		kuid := c.UserNamespace.MapToKUID(uid)
+		// Valid UID must be supplied if UID is to be changed.
+		if !kuid.Ok() {
+			return syserror.EINVAL
+		}
+
+		// "Only a privileged process (CAP_CHOWN) may change the owner
+		// of a file." -chown(2)
+		//
+		// Linux also allows chown if you own the file and are
+		// explicitly not changing its UID.
+		isNoop := uattr.Owner.UID == kuid
+		if !(hasCap || (isOwner && isNoop)) {
+			return syserror.EPERM
+		}
+
+		owner.UID = kuid
+	}
+	if gid.Ok() {
+		kgid := c.UserNamespace.MapToKGID(gid)
+		// Valid GID must be supplied if GID is to be changed.
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+
+		// "The owner of a file may change the group of the file to any
+		// group of which that owner is a member. A privileged process
+		// (CAP_CHOWN) may change the group arbitrarily." -chown(2)
+		isNoop := uattr.Owner.GID == kgid
+		isMemberGroup := c.InGroup(kgid)
+		if !(hasCap || (isOwner && (isNoop || isMemberGroup))) {
+			return syserror.EPERM
+		}
+
+		owner.GID = kgid
+	}
+
+	// FIXME(b/62949101): This is racy; the inode's owner may have changed in
+	// the meantime. (Linux holds i_mutex while calling
+	// fs/attr.c:notify_change() => inode_operations::setattr =>
+	// inode_change_ok().)
+	if err := d.Inode.SetOwner(t, d, owner); err != nil {
+		return err
+	}
+
+	// When the owner or group are changed by an unprivileged user,
+	// chown(2) also clears the set-user-ID and set-group-ID bits, but
+	// we do not support them.
+	return nil
+}
+
+func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
+	path, _, err := copyInPath(t, addr, allowEmpty)
+	if err != nil {
+		return err
+	}
+
+	if path == "" {
+		// Annoying. What's wrong with fchown?
+		file := t.FDMap().GetFile(fd)
+		if file == nil {
+			return syserror.EBADF
+		}
+		defer file.DecRef()
+
+		return chown(t, file.Dirent, uid, gid)
+	}
+
+	return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error {
+		return chown(t, d, uid, gid)
+	})
+}
+
+// Chown implements linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Lchown implements linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid)
+}
+
+// Fchown implements linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	uid := auth.UID(args[1].Uint())
+	gid := auth.GID(args[2].Uint())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, chown(t, file.Dirent, uid, gid)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	uid := auth.UID(args[2].Uint())
+	gid := auth.GID(args[3].Uint())
+	flags := args[4].Int()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid)
+}
+
+func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
+	// Must own file to change mode.
+	if !d.Inode.CheckOwnership(t) {
+		return syserror.EPERM
+	}
+
+	p := fs.FilePermsFromMode(mode)
+	if !d.Inode.SetPermissions(t, d, p) {
+		return syserror.EPERM
+	}
+
+	// File attribute changed, generate notification.
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+
+	return nil
+}
+
+func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error {
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		return chmod(t, d, mode)
+	})
+}
+
+// Chmod implements linux syscall chmod(2).
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := linux.FileMode(args[1].ModeT())
+
+	return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Fchmod implements linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	mode := linux.FileMode(args[1].ModeT())
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, chmod(t, file.Dirent, mode)
+}
+
+// Fchmodat implements linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	mode := linux.FileMode(args[2].ModeT())
+
+	return 0, nil, chmodAt(t, fd, addr, mode)
+}
+
+// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime
+// to the system time.
+func defaultSetToSystemTimeSpec() fs.TimeSpec {
+	return fs.TimeSpec{
+		ATimeSetSystemTime: true,
+		MTimeSetSystemTime: true,
+	}
+}
+
+func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error {
+	setTimestamp := func(root *fs.Dirent, d *fs.Dirent) error {
+		// Does the task own the file?
+		if !d.Inode.CheckOwnership(t) {
+			// Trying to set a specific time? Must be owner.
+			if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) {
+				return syserror.EPERM
+			}
+
+			// Trying to set to current system time? Must have write access.
+			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
+				return err
+			}
+		}
+
+		if err := d.Inode.SetTimestamps(t, d, ts); err != nil {
+			return err
+		}
+
+		// File attribute changed, generate notification.
+		d.InotifyEvent(linux.IN_ATTRIB, 0)
+		return nil
+	}
+
+	// From utimes.c:
+	// "If filename is NULL and dfd refers to an open file, then operate on
+	// the file.  Otherwise look up filename, possibly using dfd as a
+	// starting point."
+	if addr == 0 && dirFD != linux.AT_FDCWD {
+		if !resolve {
+			// Linux returns EINVAL in this case. See utimes.c.
+			return syserror.EINVAL
+		}
+		f := t.FDMap().GetFile(dirFD)
+		if f == nil {
+			return syserror.EBADF
+		}
+		defer f.DecRef()
+
+		root := t.FSContext().RootDirectory()
+		defer root.DecRef()
+
+		return setTimestamp(root, f.Dirent)
+	}
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	return fileOpOn(t, dirFD, path, resolve, setTimestamp)
+}
+
+// Utime implements linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	filenameAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times syscall.Utimbuf
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		ts = fs.TimeSpec{
+			ATime: ktime.FromSeconds(times.Actime),
+			MTime: ktime.FromSeconds(times.Modtime),
+		}
+	}
+	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
+}
+
+// Utimes implements linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	filenameAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		ts = fs.TimeSpec{
+			ATime: ktime.FromTimeval(times[0]),
+			MTime: ktime.FromTimeval(times[1]),
+		}
+	}
+	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
+}
+
+// timespecIsValid checks that the timespec is valid for use in utimensat.
+func timespecIsValid(ts linux.Timespec) bool {
+	// Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9.
+	return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9
+}
+
+// Utimensat implements linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	pathnameAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timespec
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
+			return 0, nil, syserror.EINVAL
+		}
+
+		// If both are UTIME_OMIT, this is a noop.
+		if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT {
+			return 0, nil, nil
+		}
+
+		ts = fs.TimeSpec{
+			ATime:              ktime.FromTimespec(times[0]),
+			ATimeOmit:          times[0].Nsec == linux.UTIME_OMIT,
+			ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
+			MTime:              ktime.FromTimespec(times[1]),
+			MTimeOmit:          times[1].Nsec == linux.UTIME_OMIT,
+			MTimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
+		}
+	}
+	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0)
+}
+
+// Futimesat implements linux syscall futimesat(2).
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	pathnameAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
+			times[1].Usec >= 1e6 || times[1].Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+
+		ts = fs.TimeSpec{
+			ATime: ktime.FromTimeval(times[0]),
+			MTime: ktime.FromTimeval(times[1]),
+		}
+	}
+	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
+}
+
+func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error {
+	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+	oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */)
+	if err != nil {
+		return err
+	}
+
+	return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string) error {
+		if !fs.IsDir(oldParent.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		// Rename rejects paths that end in ".", "..", or empty (i.e.
+		// the root) with EBUSY.
+		switch oldName {
+		case "", ".", "..":
+			return syserror.EBUSY
+		}
+
+		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error {
+			if !fs.IsDir(newParent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
+
+			// Rename rejects paths that end in ".", "..", or empty
+			// (i.e.  the root) with EBUSY.
+			switch newName {
+			case "", ".", "..":
+				return syserror.EBUSY
+			}
+
+			return fs.Rename(t, root, oldParent, oldName, newParent, newName)
+		})
+	})
+}
+
+// Rename implements linux syscall rename(2).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldPathAddr := args[0].Pointer()
+	newPathAddr := args[1].Pointer()
+	return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr)
+}
+
+// Renameat implements linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldDirFD := kdefs.FD(args[0].Int())
+	oldPathAddr := args[1].Pointer()
+	newDirFD := kdefs.FD(args[2].Int())
+	newPathAddr := args[3].Pointer()
+	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
+}
+
+// Fallocate implements linux system call fallocate(2).
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	mode := args[1].Int64()
+	offset := args[2].Int64()
+	length := args[3].Int64()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	if offset < 0 || length <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if mode != 0 {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOTSUP
+	}
+	if !file.Flags().Write {
+		return 0, nil, syserror.EBADF
+	}
+	if fs.IsPipe(file.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.ESPIPE
+	}
+	if fs.IsDir(file.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.EISDIR
+	}
+	if !fs.IsRegular(file.Dirent.Inode.StableAttr) {
+		return 0, nil, syserror.ENODEV
+	}
+	size := offset + length
+	if size < 0 {
+		return 0, nil, syserror.EFBIG
+	}
+	if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
+		t.SendSignal(&arch.SignalInfo{
+			Signo: int32(syscall.SIGXFSZ),
+			Code:  arch.SignalInfoUser,
+		})
+		return 0, nil, syserror.EFBIG
+	}
+
+	if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil {
+		return 0, nil, err
+	}
+
+	// File length modified, generate notification.
+	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+
+	return 0, nil, nil
+}
+
+// Flock implements linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	operation := args[1].Int()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		// flock(2): EBADF fd is not an open file descriptor.
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	nonblocking := operation&linux.LOCK_NB != 0
+	operation &^= linux.LOCK_NB
+
+	// flock(2):
+	// Locks created by flock() are associated with an open file table entry. This means that
+	// duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the
+	// same lock, and this lock may be modified or released using any of these descriptors. Furthermore,
+	// the lock is released either by an explicit LOCK_UN operation on any of these duplicate
+	// descriptors, or when all such descriptors have been closed.
+	//
+	// If a process uses open(2) (or similar) to obtain more than one descriptor for the same file,
+	// these descriptors are treated independently by flock(). An attempt to lock the file using
+	// one of these file descriptors may be denied by a lock that the calling process has already placed via
+	// another descriptor.
+	//
+	// We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2)
+	// and fork(2).
+	lockUniqueID := lock.UniqueID(file.UniqueID)
+
+	// A BSD style lock spans the entire file.
+	rng := lock.LockRange{
+		Start: 0,
+		End:   lock.LockEOF,
+	}
+
+	switch operation {
+	case linux.LOCK_EX:
+		if nonblocking {
+			// Since we're nonblocking we pass a nil lock.Blocker implementation.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+				return 0, nil, syserror.EWOULDBLOCK
+			}
+		} else {
+			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+				return 0, nil, syserror.EINTR
+			}
+		}
+	case linux.LOCK_SH:
+		if nonblocking {
+			// Since we're nonblocking we pass a nil lock.Blocker implementation.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+				return 0, nil, syserror.EWOULDBLOCK
+			}
+		} else {
+			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+				return 0, nil, syserror.EINTR
+			}
+		}
+	case linux.LOCK_UN:
+		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng)
+	default:
+		// flock(2): EINVAL operation is invalid.
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, nil
+}
+
+const (
+	memfdPrefix     = "/memfd:"
+	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
+	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2).
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+
+	if flags&^memfdAllFlags != 0 {
+		// Unknown bits in flags.
+		return 0, nil, syserror.EINVAL
+	}
+
+	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+	cloExec := flags&linux.MFD_CLOEXEC != 0
+
+	name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix))
+	if err != nil {
+		return 0, nil, err
+	}
+	if len(name) > memfdMaxNameLen {
+		return 0, nil, syserror.EINVAL
+	}
+	name = memfdPrefix + name
+
+	inode := tmpfs.NewMemfdInode(t, allowSeals)
+	dirent := fs.NewDirent(inode, name)
+	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
+	// FMODE_READ | FMODE_WRITE.
+	file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	defer dirent.DecRef()
+	defer file.DecRef()
+
+	fdFlags := kernel.FDFlags{CloseOnExec: cloExec}
+	newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(newFD), nil, nil
+}