// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "io" "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) // fileOpAt performs an operation on the second last component in the path. func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string) error) error { // Extract the last component. dir, name := fs.SplitLast(path) if dir == "/" { // Common case: we are accessing a file in the root. root := t.FSContext().RootDirectory() err := fn(root, root, name) root.DecRef() return err } else if dir == "." && dirFD == linux.AT_FDCWD { // Common case: we are accessing a file relative to the current // working directory; skip the look-up. wd := t.FSContext().WorkingDirectory() root := t.FSContext().RootDirectory() err := fn(root, wd, name) wd.DecRef() root.DecRef() return err } return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { return fn(root, d, name) }) } // fileOpOn performs an operation on the last entry of the path. func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { var ( d *fs.Dirent // The file. wd *fs.Dirent // The working directory (if required.) rel *fs.Dirent // The relative directory for search (if required.) f *fs.File // The file corresponding to dirFD (if required.) err error ) // Extract the working directory (maybe). if len(path) > 0 && path[0] == '/' { // Absolute path; rel can be nil. } else if dirFD == linux.AT_FDCWD { // Need to reference the working directory. wd = t.FSContext().WorkingDirectory() rel = wd } else { // Need to extract the given FD. f = t.FDMap().GetFile(dirFD) if f == nil { return syserror.EBADF } rel = f.Dirent if !fs.IsDir(rel.Inode.StableAttr) { return syserror.ENOTDIR } } // Grab the root (always required.) root := t.FSContext().RootDirectory() // Lookup the node. remainingTraversals := uint(linux.MaxSymlinkTraversals) if resolve { d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals) } else { d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) } root.DecRef() if wd != nil { wd.DecRef() } if f != nil { f.DecRef() } if err != nil { return err } err = fn(root, d) d.DecRef() return err } // copyInPath copies a path in. func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) { path, err = t.CopyInString(addr, linux.PATH_MAX) if err != nil { return "", false, err } if path == "" && !allowEmpty { return "", false, syserror.ENOENT } // If the path ends with a /, then checks must be enforced in various // ways in the different callers. We pass this back to the caller. path, dirPath = fs.TrimTrailingSlashes(path) return path, dirPath, nil } func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err } resolve := flags&linux.O_NOFOLLOW == 0 err = fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { // First check a few things about the filesystem before trying to get the file // reference. // // It's required that Check does not try to open files not that aren't backed by // this dirent (e.g. pipes and sockets) because this would result in opening these // files an extra time just to check permissions. if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { return err } if fs.IsSymlink(d.Inode.StableAttr) && !resolve { return syserror.ELOOP } fileFlags := linuxToFlags(flags) // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. fileFlags.LargeFile = true if fs.IsDir(d.Inode.StableAttr) { // Don't allow directories to be opened writable. if fileFlags.Write { return syserror.EISDIR } } else { // If O_DIRECTORY is set, but the file is not a directory, then fail. if fileFlags.Directory { return syserror.ENOTDIR } // If it's a directory, then make sure. if dirPath { return syserror.ENOTDIR } if flags&linux.O_TRUNC != 0 { if err := d.Inode.Truncate(t, d, 0); err != nil { return err } } } file, err := d.Inode.GetFile(t, d, fileFlags) if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } defer file.DecRef() // Success. fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) if err != nil { return err } // Set return result in frame. fd = uintptr(newFD) // Generate notification for opened file. d.InotifyEvent(linux.IN_OPEN, 0) return nil }) return fd, err // Use result in frame. } func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err } if dirPath { return syserror.ENOENT } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } // Do we have the appropriate permissions on the parent? if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } // Attempt a creation. perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) switch mode.FileType() { case 0: // "Zero file type is equivalent to type S_IFREG." - mknod(2) fallthrough case linux.ModeRegular: // We are not going to return the file, so the actual // flags used don't matter, but they cannot be empty or // Create will complain. flags := fs.FileFlags{Read: true, Write: true} file, err := d.Create(t, root, name, flags, perms) if err != nil { return err } file.DecRef() return nil case linux.ModeNamedPipe: return d.CreateFifo(t, root, name, perms) case linux.ModeSocket: // While it is possible create a unix domain socket file on linux // using mknod(2), in practice this is pretty useless from an // application. Linux internally uses mknod() to create the socket // node during bind(2), but we implement bind(2) independently. If // an application explicitly creates a socket node using mknod(), // you can't seem to bind() or connect() to the resulting socket. // // Instead of emulating this seemingly useless behaviour, we'll // indicate that the filesystem doesn't support the creation of // sockets. return syserror.EOPNOTSUPP case linux.ModeCharacterDevice: fallthrough case linux.ModeBlockDevice: // TODO: We don't support creating block or character // devices at the moment. // // When we start supporting block and character devices, we'll // need to check for CAP_MKNOD here. return syserror.EPERM default: // "EINVAL - mode requested creation of something other than a // regular file, device special file, FIFO or socket." - mknod(2) return syserror.EINVAL } }) } // Mknod implements the linux syscall mknod(2). func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { path := args[0].Pointer() mode := linux.FileMode(args[1].ModeT()) // We don't need this argument until we support creation of device nodes. _ = args[2].Uint() // dev return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode) } // Mknodat implements the linux syscall mknodat(2). func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) path := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) // We don't need this argument until we support creation of device nodes. _ = args[3].Uint() // dev return 0, nil, mknodAt(t, dirFD, path, mode) } func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err } if dirPath { return 0, syserror.ENOENT } err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } fileFlags := linuxToFlags(flags) // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. fileFlags.LargeFile = true // Does this file exist already? remainingTraversals := uint(linux.MaxSymlinkTraversals) targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) var newFile *fs.File switch err { case nil: // The file existed. defer targetDirent.DecRef() // Check if we wanted to create. if flags&linux.O_EXCL != 0 { return syserror.EEXIST } // Like sys_open, check for a few things about the // filesystem before trying to get a reference to the // fs.File. The same constraints on Check apply. if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { return err } // Should we truncate the file? if flags&linux.O_TRUNC != 0 { if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil { return err } } // Create a new fs.File. newFile, err = targetDirent.Inode.GetFile(t, targetDirent, fileFlags) if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } defer newFile.DecRef() case syserror.ENOENT: // File does not exist. Proceed with creation. // Do we have write permissions on the parent? if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } // Attempt a creation. perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) newFile, err = d.Create(t, root, name, fileFlags, perms) if err != nil { // No luck, bail. return err } defer newFile.DecRef() targetDirent = newFile.Dirent default: return err } // Success. fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits()) if err != nil { return err } // Set result in frame. fd = uintptr(newFD) // Queue the open inotify event. The creation event is // automatically queued when the dirent is targetDirent. The // open events are implemented at the syscall layer so we need // to manually queue one here. targetDirent.InotifyEvent(linux.IN_OPEN, 0) return nil }) return fd, err // Use result in frame. } // Open implements linux syscall open(2). func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := uint(args[1].Uint()) if flags&linux.O_CREAT != 0 { mode := linux.FileMode(args[2].ModeT()) n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode) return n, nil, err } n, err := openAt(t, linux.AT_FDCWD, addr, flags) return n, nil, err } // Openat implements linux syscall openat(2). func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) addr := args[1].Pointer() flags := uint(args[2].Uint()) if flags&linux.O_CREAT != 0 { mode := linux.FileMode(args[3].ModeT()) n, err := createAt(t, dirFD, addr, flags, mode) return n, nil, err } n, err := openAt(t, dirFD, addr, flags) return n, nil, err } // Creat implements linux syscall creat(2). func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := linux.FileMode(args[1].ModeT()) n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode) return n, nil, err } // accessContext is a context that overrides the credentials used, but // otherwise carries the same values as the embedded context. // // accessContext should only be used for access(2). type accessContext struct { context.Context creds *auth.Credentials } // Value implements context.Context. func (ac accessContext) Value(key interface{}) interface{} { switch key { case auth.CtxCredentials: return ac.creds default: return ac.Context.Value(key) } } func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error { const rOK = 4 const wOK = 2 const xOK = 1 path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err } // Sanity check the mode. if mode&^(rOK|wOK|xOK) != 0 { return syserror.EINVAL } return fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. // // "access() needs to use the real uid/gid, not the effective // uid/gid. We do this by temporarily clearing all FS-related // capabilities and switching the fsuid/fsgid around to the // real ones." -fs/open.c:faccessat creds := t.Credentials().Fork() creds.EffectiveKUID = creds.RealKUID creds.EffectiveKGID = creds.RealKGID if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { creds.EffectiveCaps = creds.PermittedCaps } else { creds.EffectiveCaps = 0 } ctx := &accessContext{ Context: t, creds: creds, } return d.Inode.CheckPermission(ctx, fs.PermMask{ Read: mode&rOK != 0, Write: mode&wOK != 0, Execute: mode&xOK != 0, }) }) } // Access implements linux syscall access(2). func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() return 0, nil, accessAt(t, linux.AT_FDCWD, addr, true, mode) } // Faccessat implements linux syscall faccessat(2). func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) addr := args[1].Pointer() mode := args[2].ModeT() flags := args[3].Int() return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode) } // Ioctl implements linux syscall ioctl(2). func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) request := int(args[1].Int()) file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() // Shared flags between file and socket. switch request { case linux.FIONCLEX: t.FDMap().SetFlags(fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: t.FDMap().SetFlags(fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil case linux.FIONBIO: var set int32 if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.Flags() if set != 0 { flags.NonBlocking = true } else { flags.NonBlocking = false } file.SetFlags(flags.Settable()) return 0, nil, nil case linux.FIOASYNC: var set int32 if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.Flags() if set != 0 { flags.Async = true } else { flags.Async = false } file.SetFlags(flags.Settable()) return 0, nil, nil case linux.FIOSETOWN, linux.SIOCSPGRP: var set int32 if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { return 0, nil, err } fSetOwn(t, file, set) return 0, nil, nil case linux.FIOGETOWN, linux.SIOCGPGRP: who := fGetOwn(t, file) _, err := t.CopyOut(args[2].Pointer(), &who) return 0, nil, err default: ret, err := file.FileOperations.Ioctl(t, t.MemoryManager(), args) if err != nil { return 0, nil, err } return ret, nil, nil } } // Getcwd implements the linux syscall getcwd(2). func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() size := args[1].SizeT() cwd := t.FSContext().WorkingDirectory() defer cwd.DecRef() root := t.FSContext().RootDirectory() defer root.DecRef() // Get our fullname from the root and preprend unreachable if the root was // unreachable from our current dirent this is the same behavior as on linux. s, reachable := cwd.FullName(root) if !reachable { s = "(unreachable)" + s } // Note this is >= because we need a terminator. if uint(len(s)) >= size { return 0, nil, syserror.ERANGE } // Copy out the path name for the node. bytes, err := t.CopyOutBytes(addr, []byte(s)) if err != nil { return 0, nil, err } // Top it off with a terminator. _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00")) return uintptr(bytes + 1), nil, err } // Chroot implements the linux syscall chroot(2). func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() if !t.HasCapability(linux.CAP_SYS_CHROOT) { return 0, nil, syserror.EPERM } path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, nil, err } return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { // Is it a directory? if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } // Does it have execute permissions? if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { return err } t.FSContext().SetRootDirectory(d) return nil }) } // Chdir implements the linux syscall chdir(2). func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, nil, err } return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { // Is it a directory? if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } // Does it have execute permissions? if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { return err } t.FSContext().SetWorkingDirectory(d) return nil }) } // Fchdir implements the linux syscall fchdir(2). func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() // Is it a directory? if !fs.IsDir(file.Dirent.Inode.StableAttr) { return 0, nil, syserror.ENOTDIR } // Does it have execute permissions? if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { return 0, nil, err } t.FSContext().SetWorkingDirectory(file.Dirent) return 0, nil, nil } // Close implements linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) file, ok := t.FDMap().Remove(fd) if !ok { return 0, nil, syserror.EBADF } defer file.DecRef() err := file.Flush(t) return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file) } // Dup implements linux syscall dup(2). func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits()) if err != nil { return 0, nil, syserror.EMFILE } return uintptr(newfd), nil, nil } // Dup2 implements linux syscall dup2(2). func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldfd := kdefs.FD(args[0].Int()) newfd := kdefs.FD(args[1].Int()) // If oldfd is a valid file descriptor, and newfd has the same value as oldfd, // then dup2() does nothing, and returns newfd. if oldfd == newfd { oldFile := t.FDMap().GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } defer oldFile.DecRef() return uintptr(newfd), nil, nil } // Zero out flags arg to be used by Dup3. args[2].Value = 0 return Dup3(t, args) } // Dup3 implements linux syscall dup3(2). func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldfd := kdefs.FD(args[0].Int()) newfd := kdefs.FD(args[1].Int()) flags := args[2].Uint() if oldfd == newfd { return 0, nil, syserror.EINVAL } oldFile := t.FDMap().GetFile(oldfd) if oldFile == nil { return 0, nil, syserror.EBADF } defer oldFile.DecRef() err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) if err != nil { return 0, nil, err } return uintptr(newfd), nil, nil } func fGetOwn(t *kernel.Task, file *fs.File) int32 { ma := file.Async(nil) if ma == nil { return 0 } a := ma.(*fasync.FileAsync) ot, otg, opg := a.Owner() switch { case ot != nil: return int32(t.PIDNamespace().IDOfTask(ot)) case otg != nil: return int32(t.PIDNamespace().IDOfThreadGroup(otg)) case opg != nil: return int32(-t.PIDNamespace().IDOfProcessGroup(opg)) default: return 0 } } // fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux. // // If who is positive, it represents a PID. If negative, it represents a PGID. // If the PID or PGID is invalid, the owner is silently unset. func fSetOwn(t *kernel.Task, file *fs.File, who int32) { a := file.Async(fasync.New).(*fasync.FileAsync) if who < 0 { pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)) a.SetOwnerProcessGroup(t, pg) } tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)) a.SetOwnerThreadGroup(t, tg) } // Fcntl implements linux syscall fcntl(2). func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) cmd := args[1].Int() file, flags := t.FDMap().GetDescriptor(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: from := kdefs.FD(args[2].Int()) fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC} fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil case linux.F_GETFD: return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() t.FDMap().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) case linux.F_GETFL: return uintptr(file.Flags().ToLinux()), nil, nil case linux.F_SETFL: flags := uint(args[2].Uint()) file.SetFlags(linuxToFlags(flags).Settable()) case linux.F_SETLK, linux.F_SETLKW: // In Linux the file system can choose to provide lock operations for an inode. // Normally pipe and socket types lack lock operations. We diverge and use a heavy // hammer by only allowing locks on files and directories. if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) { return 0, nil, syserror.EBADF } // Copy in the lock request. flockAddr := args[2].Pointer() var flock syscall.Flock_t if _, err := t.CopyIn(flockAddr, &flock); err != nil { return 0, nil, err } // Compute the lock whence. var sw fs.SeekWhence switch flock.Whence { case 0: sw = fs.SeekSet case 1: sw = fs.SeekCurrent case 2: sw = fs.SeekEnd default: return 0, nil, syserror.EINVAL } // Compute the lock offset. var off int64 switch sw { case fs.SeekSet: off = 0 case fs.SeekCurrent: // Note that Linux does not hold any mutexes while retrieving the file offset, // see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. off = file.Offset() case fs.SeekEnd: uattr, err := file.Dirent.Inode.UnstableAttr(t) if err != nil { return 0, nil, err } off = uattr.Size default: return 0, nil, syserror.EINVAL } // Compute the lock range. rng, err := lock.ComputeRange(flock.Start, flock.Len, off) if err != nil { return 0, nil, err } // The lock uid is that of the Task's FDMap. lockUniqueID := lock.UniqueID(t.FDMap().ID()) // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. switch flock.Type { case syscall.F_RDLCK: if !file.Flags().Read { return 0, nil, syserror.EBADF } if cmd == syscall.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } return 0, nil, nil case syscall.F_WRLCK: if !file.Flags().Write { return 0, nil, syserror.EBADF } if cmd == syscall.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } return 0, nil, nil case syscall.F_UNLCK: file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) return 0, nil, nil default: return 0, nil, syserror.EINVAL } case linux.F_GETOWN: return uintptr(fGetOwn(t, file)), nil, nil case linux.F_SETOWN: fSetOwn(t, file, args[2].Int()) return 0, nil, nil case linux.F_GET_SEALS: val, err := tmpfs.GetSeals(file.Dirent.Inode) return uintptr(val), nil, err case linux.F_ADD_SEALS: if !file.Flags().Write { return 0, nil, syserror.EPERM } err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint()) return 0, nil, err default: // Everything else is not yet supported. return 0, nil, syserror.EINVAL } return 0, nil, nil } const ( _FADV_NORMAL = 0 _FADV_RANDOM = 1 _FADV_SEQUENTIAL = 2 _FADV_WILLNEED = 3 _FADV_DONTNEED = 4 _FADV_NOREUSE = 5 ) // Fadvise64 implements linux syscall fadvise64(2). // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) length := args[2].Int64() advice := args[3].Int() // Note: offset is allowed to be negative. if length < 0 { return 0, nil, syserror.EINVAL } file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() // If the FD refers to a pipe or FIFO, return error. if fs.IsPipe(file.Dirent.Inode.StableAttr) { return 0, nil, syserror.ESPIPE } switch advice { case _FADV_NORMAL: case _FADV_RANDOM: case _FADV_SEQUENTIAL: case _FADV_WILLNEED: case _FADV_DONTNEED: case _FADV_NOREUSE: default: return 0, nil, syserror.EINVAL } // Sure, whatever. return 0, nil, nil } func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } // Does this directory exist already? remainingTraversals := uint(linux.MaxSymlinkTraversals) f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) switch err { case nil: // The directory existed. defer f.DecRef() return syserror.EEXIST case syserror.EACCES: // Permission denied while walking to the directory. return err default: // Do we have write permissions on the parent? if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } // Create the directory. perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) return d.CreateDirectory(t, root, name, perms) } }) } // Mkdir implements linux syscall mkdir(2). func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := linux.FileMode(args[1].ModeT()) return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode) } // Mkdirat implements linux syscall mkdirat(2). func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) return 0, nil, mkdirAt(t, dirFD, addr, mode) } func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err } // Special case: removing the root always returns EBUSY. if path == "/" { return syserror.EBUSY } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } // Linux returns different ernos when the path ends in single // dot vs. double dots. switch name { case ".": return syserror.EINVAL case "..": return syserror.ENOTEMPTY } if err := fs.MayDelete(t, root, d, name); err != nil { return err } return d.RemoveDirectory(t, root, name) }) } // Rmdir implements linux syscall rmdir(2). func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr) } func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error { newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err } if dirPath { return syserror.ENOENT } // The oldPath is copied in verbatim. This is because the symlink // will include all details, including trailing slashes. oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX) if err != nil { return err } if oldPath == "" { return syserror.ENOENT } return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string) error { if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } // Make sure we have write permissions on the parent directory. if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } return d.CreateLink(t, root, oldPath, name) }) } // Symlink implements linux syscall symlink(2). func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() newAddr := args[1].Pointer() return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr) } // Symlinkat implements linux syscall symlinkat(2). func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() dirFD := kdefs.FD(args[1].Int()) newAddr := args[2].Pointer() return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr) } // mayLinkAt determines whether t can create a hard link to target. // // This corresponds to Linux's fs/namei.c:may_linkat. func mayLinkAt(t *kernel.Task, target *fs.Inode) error { // Linux will impose the following restrictions on hard links only if // sysctl_protected_hardlinks is enabled. The kernel disables this // setting by default for backward compatibility (see commit // 561ec64ae67e), but also recommends that distributions enable it (and // Debian does: // https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098). // // gVisor currently behaves as though sysctl_protected_hardlinks is // always enabled, and thus imposes the following restrictions on hard // links. if target.CheckOwnership(t) { // fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER) // can hardlink all they like." return nil } // If we are not the owner, then the file must be regular and have // Read+Write permissions. if !fs.IsRegular(target.StableAttr) { return syserror.EPERM } if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil { return syserror.EPERM } return nil } // linkAt creates a hard link to the target specified by oldDirFD and oldAddr, // specified by newDirFD and newAddr. If resolve is true, then the symlinks // will be followed when evaluating the target. func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error { oldPath, _, err := copyInPath(t, oldAddr, allowEmpty) if err != nil { return err } newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err } if dirPath { return syserror.ENOENT } if allowEmpty && oldPath == "" { target := t.FDMap().GetFile(oldDirFD) if target == nil { return syserror.EBADF } defer target.DecRef() if err := mayLinkAt(t, target.Dirent.Inode); err != nil { return err } // Resolve the target directory. return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { if !fs.IsDir(newParent.Inode.StableAttr) { return syserror.ENOTDIR } // Make sure we have write permissions on the parent directory. if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } return newParent.CreateHardLink(t, root, target.Dirent, newName) }) } // Resolve oldDirFD and oldAddr to a dirent. The "resolve" argument // only applies to this name. return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent) error { if err := mayLinkAt(t, target.Inode); err != nil { return err } // Next resolve newDirFD and newAddr to the parent dirent and name. return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { if !fs.IsDir(newParent.Inode.StableAttr) { return syserror.ENOTDIR } // Make sure we have write permissions on the parent directory. if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err } return newParent.CreateHardLink(t, root, target, newName) }) }) } // Link implements linux syscall link(2). func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() newAddr := args[1].Pointer() // man link(2): // POSIX.1-2001 says that link() should dereference oldpath if it is a // symbolic link. However, since kernel 2.0, Linux does not do so: if // oldpath is a symbolic link, then newpath is created as a (hard) link // to the same symbolic link file (i.e., newpath becomes a symbolic // link to the same file that oldpath refers to). resolve := false return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */) } // Linkat implements linux syscall linkat(2). func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldDirFD := kdefs.FD(args[0].Int()) oldAddr := args[1].Pointer() newDirFD := kdefs.FD(args[2].Int()) newAddr := args[3].Pointer() // man linkat(2): // By default, linkat(), does not dereference oldpath if it is a // symbolic link (like link(2)). Since Linux 2.6.18, the flag // AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be // dereferenced if it is a symbolic link. flags := args[4].Int() // Sanity check flags. if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 { return 0, nil, syserror.EINVAL } resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) { return 0, nil, syserror.ENOENT } return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) } func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, err } if dirPath { return 0, syserror.ENOENT } err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { // Check for Read permission. if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil { return err } s, err := d.Inode.Readlink(t) if err == syserror.ENOLINK { return syserror.EINVAL } if err != nil { return err } buffer := []byte(s) if uint(len(buffer)) > size { buffer = buffer[:size] } n, err := t.CopyOutBytes(bufAddr, buffer) // Update frame return value. copied = uintptr(n) return err }) return copied, err // Return frame value. } // Readlink implements linux syscall readlink(2). func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() bufAddr := args[1].Pointer() size := args[2].SizeT() n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size) return n, nil, err } // Readlinkat implements linux syscall readlinkat(2). func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) addr := args[1].Pointer() bufAddr := args[2].Pointer() size := args[3].SizeT() n, err := readlinkAt(t, dirFD, addr, bufAddr, size) return n, nil, err } func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err } if dirPath { return syserror.ENOENT } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { if !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } if err := fs.MayDelete(t, root, d, name); err != nil { return err } return d.Remove(t, root, name) }) } // Unlink implements linux syscall unlink(2). func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr) } // Unlinkat implements linux syscall unlinkat(2). func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) addr := args[1].Pointer() flags := args[2].Uint() if flags&linux.AT_REMOVEDIR != 0 { return 0, nil, rmdirAt(t, dirFD, addr) } return 0, nil, unlinkAt(t, dirFD, addr) } // Truncate implements linux syscall truncate(2). func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].Int64() if length < 0 { return 0, nil, syserror.EINVAL } path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return 0, nil, err } if dirPath { return 0, nil, syserror.EINVAL } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ Signo: int32(syscall.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG } return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { if fs.IsDir(d.Inode.StableAttr) { return syserror.EISDIR } if !fs.IsFile(d.Inode.StableAttr) { return syserror.EINVAL } // Reject truncation if the access permissions do not allow truncation. // This is different from the behavior of sys_ftruncate, see below. if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil { return err } if err := d.Inode.Truncate(t, d, length); err != nil { return err } // File length modified, generate notification. d.InotifyEvent(linux.IN_MODIFY, 0) return nil }) } // Ftruncate implements linux syscall ftruncate(2). func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) length := args[1].Int64() file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() // Reject truncation if the file flags do not permit this operation. // This is different from truncate(2) above. if !file.Flags().Write { return 0, nil, syserror.EINVAL } // Note that this is different from truncate(2) above, where a // directory returns EISDIR. if !fs.IsFile(file.Dirent.Inode.StableAttr) { return 0, nil, syserror.EINVAL } if length < 0 { return 0, nil, syserror.EINVAL } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&arch.SignalInfo{ Signo: int32(syscall.SIGXFSZ), Code: arch.SignalInfoUser, }) return 0, nil, syserror.EFBIG } if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil { return 0, nil, err } // File length modified, generate notification. file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) return 0, nil, nil } // Umask implements linux syscall umask(2). func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { mask := args[0].ModeT() mask = t.FSContext().SwapUmask(mask & 0777) return uintptr(mask), nil, nil } // Change ownership of a file. // // uid and gid may be -1, in which case they will not be changed. func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { owner := fs.FileOwner{ UID: auth.NoID, GID: auth.NoID, } uattr, err := d.Inode.UnstableAttr(t) if err != nil { return err } c := t.Credentials() hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN) isOwner := uattr.Owner.UID == c.EffectiveKUID if uid.Ok() { kuid := c.UserNamespace.MapToKUID(uid) // Valid UID must be supplied if UID is to be changed. if !kuid.Ok() { return syserror.EINVAL } // "Only a privileged process (CAP_CHOWN) may change the owner // of a file." -chown(2) // // Linux also allows chown if you own the file and are // explicitly not changing its UID. isNoop := uattr.Owner.UID == kuid if !(hasCap || (isOwner && isNoop)) { return syserror.EPERM } owner.UID = kuid } if gid.Ok() { kgid := c.UserNamespace.MapToKGID(gid) // Valid GID must be supplied if GID is to be changed. if !kgid.Ok() { return syserror.EINVAL } // "The owner of a file may change the group of the file to any // group of which that owner is a member. A privileged process // (CAP_CHOWN) may change the group arbitrarily." -chown(2) isNoop := uattr.Owner.GID == kgid isMemberGroup := c.InGroup(kgid) if !(hasCap || (isOwner && (isNoop || isMemberGroup))) { return syserror.EPERM } owner.GID = kgid } // FIXME: This is racy; the inode's owner may have changed in // the meantime. (Linux holds i_mutex while calling // fs/attr.c:notify_change() => inode_operations::setattr => // inode_change_ok().) if err := d.Inode.SetOwner(t, d, owner); err != nil { return err } // When the owner or group are changed by an unprivileged user, // chown(2) also clears the set-user-ID and set-group-ID bits, but // we do not support them. return nil } func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { path, _, err := copyInPath(t, addr, allowEmpty) if err != nil { return err } if path == "" { // Annoying. What's wrong with fchown? file := t.FDMap().GetFile(fd) if file == nil { return syserror.EBADF } defer file.DecRef() return chown(t, file.Dirent, uid, gid) } return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { return chown(t, d, uid, gid) }) } // Chown implements linux syscall chown(2). func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() uid := auth.UID(args[1].Uint()) gid := auth.GID(args[2].Uint()) return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid) } // Lchown implements linux syscall lchown(2). func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() uid := auth.UID(args[1].Uint()) gid := auth.GID(args[2].Uint()) return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid) } // Fchown implements linux syscall fchown(2). func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) uid := auth.UID(args[1].Uint()) gid := auth.GID(args[2].Uint()) file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() return 0, nil, chown(t, file.Dirent, uid, gid) } // Fchownat implements Linux syscall fchownat(2). func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) addr := args[1].Pointer() uid := auth.UID(args[2].Uint()) gid := auth.GID(args[3].Uint()) flags := args[4].Int() if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { return 0, nil, syserror.EINVAL } return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid) } func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { // Must own file to change mode. if !d.Inode.CheckOwnership(t) { return syserror.EPERM } p := fs.FilePermsFromMode(mode) if !d.Inode.SetPermissions(t, d, p) { return syserror.EPERM } // File attribute changed, generate notification. d.InotifyEvent(linux.IN_ATTRIB, 0) return nil } func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err } return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { return chmod(t, d, mode) }) } // Chmod implements linux syscall chmod(2). func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := linux.FileMode(args[1].ModeT()) return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode) } // Fchmod implements linux syscall fchmod(2). func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) mode := linux.FileMode(args[1].ModeT()) file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() return 0, nil, chmod(t, file.Dirent, mode) } // Fchmodat implements linux syscall fchmodat(2). func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) addr := args[1].Pointer() mode := linux.FileMode(args[2].ModeT()) return 0, nil, chmodAt(t, fd, addr, mode) } // defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime // to the system time. func defaultSetToSystemTimeSpec() fs.TimeSpec { return fs.TimeSpec{ ATimeSetSystemTime: true, MTimeSetSystemTime: true, } } func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { setTimestamp := func(root *fs.Dirent, d *fs.Dirent) error { // Does the task own the file? if !d.Inode.CheckOwnership(t) { // Trying to set a specific time? Must be owner. if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) { return syserror.EPERM } // Trying to set to current system time? Must have write access. if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil { return err } } if err := d.Inode.SetTimestamps(t, d, ts); err != nil { return err } // File attribute changed, generate notification. d.InotifyEvent(linux.IN_ATTRIB, 0) return nil } // From utimes.c: // "If filename is NULL and dfd refers to an open file, then operate on // the file. Otherwise look up filename, possibly using dfd as a // starting point." if addr == 0 && dirFD != linux.AT_FDCWD { if !resolve { // Linux returns EINVAL in this case. See utimes.c. return syserror.EINVAL } f := t.FDMap().GetFile(dirFD) if f == nil { return syserror.EBADF } defer f.DecRef() root := t.FSContext().RootDirectory() defer root.DecRef() return setTimestamp(root, f.Dirent) } path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { return err } return fileOpOn(t, dirFD, path, resolve, setTimestamp) } // Utime implements linux syscall utime(2). func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { filenameAddr := args[0].Pointer() timesAddr := args[1].Pointer() // No timesAddr argument will be interpreted as current system time. ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times syscall.Utimbuf if _, err := t.CopyIn(timesAddr, ×); err != nil { return 0, nil, err } ts = fs.TimeSpec{ ATime: ktime.FromSeconds(times.Actime), MTime: ktime.FromSeconds(times.Modtime), } } return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true) } // Utimes implements linux syscall utimes(2). func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { filenameAddr := args[0].Pointer() timesAddr := args[1].Pointer() // No timesAddr argument will be interpreted as current system time. ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timeval if _, err := t.CopyIn(timesAddr, ×); err != nil { return 0, nil, err } ts = fs.TimeSpec{ ATime: ktime.FromTimeval(times[0]), MTime: ktime.FromTimeval(times[1]), } } return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true) } // timespecIsValid checks that the timespec is valid for use in utimensat. func timespecIsValid(ts linux.Timespec) bool { // Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9. return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9 } // Utimensat implements linux syscall utimensat(2). func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() flags := args[3].Int() // No timesAddr argument will be interpreted as current system time. ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timespec if _, err := t.CopyIn(timesAddr, ×); err != nil { return 0, nil, err } if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) { return 0, nil, syserror.EINVAL } // If both are UTIME_OMIT, this is a noop. if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT { return 0, nil, nil } ts = fs.TimeSpec{ ATime: ktime.FromTimespec(times[0]), ATimeOmit: times[0].Nsec == linux.UTIME_OMIT, ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW, MTime: ktime.FromTimespec(times[1]), MTimeOmit: times[1].Nsec == linux.UTIME_OMIT, MTimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW, } } return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0) } // Futimesat implements linux syscall futimesat(2). func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := kdefs.FD(args[0].Int()) pathnameAddr := args[1].Pointer() timesAddr := args[2].Pointer() // No timesAddr argument will be interpreted as current system time. ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timeval if _, err := t.CopyIn(timesAddr, ×); err != nil { return 0, nil, err } if times[0].Usec >= 1e6 || times[0].Usec < 0 || times[1].Usec >= 1e6 || times[1].Usec < 0 { return 0, nil, syserror.EINVAL } ts = fs.TimeSpec{ ATime: ktime.FromTimeval(times[0]), MTime: ktime.FromTimeval(times[1]), } } return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) } func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error { newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { return err } oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */) if err != nil { return err } return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string) error { if !fs.IsDir(oldParent.Inode.StableAttr) { return syserror.ENOTDIR } // Rename rejects paths that end in ".", "..", or empty (i.e. // the root) with EBUSY. switch oldName { case "", ".", "..": return syserror.EBUSY } return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { if !fs.IsDir(newParent.Inode.StableAttr) { return syserror.ENOTDIR } // Rename rejects paths that end in ".", "..", or empty // (i.e. the root) with EBUSY. switch newName { case "", ".", "..": return syserror.EBUSY } return fs.Rename(t, root, oldParent, oldName, newParent, newName) }) }) } // Rename implements linux syscall rename(2). func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldPathAddr := args[0].Pointer() newPathAddr := args[1].Pointer() return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr) } // Renameat implements linux syscall renameat(2). func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldDirFD := kdefs.FD(args[0].Int()) oldPathAddr := args[1].Pointer() newDirFD := kdefs.FD(args[2].Int()) newPathAddr := args[3].Pointer() return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) } // Fallocate implements linux system call fallocate(2). // (well, not really, but at least we return the expected error codes) func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) offset := args[2].Int64() length := args[3].Int64() file := t.FDMap().GetFile(fd) if file == nil { return 0, nil, syserror.EBADF } defer file.DecRef() if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL } return 0, nil, syserror.EOPNOTSUPP } // Flock implements linux syscall flock(2). func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) operation := args[1].Int() file := t.FDMap().GetFile(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF } defer file.DecRef() nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB // flock(2): // Locks created by flock() are associated with an open file table entry. This means that // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the // same lock, and this lock may be modified or released using any of these descriptors. Furthermore, // the lock is released either by an explicit LOCK_UN operation on any of these duplicate // descriptors, or when all such descriptors have been closed. // // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file, // these descriptors are treated independently by flock(). An attempt to lock the file using // one of these file descriptors may be denied by a lock that the calling process has already placed via // another descriptor. // // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2) // and fork(2). lockUniqueID := lock.UniqueID(file.UniqueID) // A BSD style lock spans the entire file. rng := lock.LockRange{ Start: 0, End: lock.LockEOF, } switch operation { case linux.LOCK_EX: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_SH: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_UN: file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng) default: // flock(2): EINVAL operation is invalid. return 0, nil, syserror.EINVAL } return 0, nil, nil } // Sendfile implements linux system call sendfile(2). func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { outFD := kdefs.FD(args[0].Int()) inFD := kdefs.FD(args[1].Int()) offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) // Don't send a negative number of bytes. if count < 0 { return 0, nil, syserror.EINVAL } if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } // Get files. outFile := t.FDMap().GetFile(outFD) if outFile == nil { return 0, nil, syserror.EBADF } defer outFile.DecRef() inFile := t.FDMap().GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } defer inFile.DecRef() // Verify that the outfile is writable. outFlags := outFile.Flags() if !outFlags.Write { return 0, nil, syserror.EBADF } // Verify that the outfile Append flag is not set. if outFlags.Append { return 0, nil, syserror.EINVAL } // Verify that we have a regular infile. // http://elixir.free-electrons.com/linux/latest/source/fs/splice.c#L933 if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { return 0, nil, syserror.EINVAL } // Verify that the infile is readable. if !inFile.Flags().Read { return 0, nil, syserror.EBADF } // Setup for sending data. var n int64 var err error w := &fs.FileWriter{t, outFile} hasOffset := offsetAddr != 0 // If we have a provided offset. if hasOffset { // Verify that when offset address is not null, infile must be seekable if !inFile.Flags().Pread { return 0, nil, syserror.ESPIPE } // Copy in the offset. var offset int64 if _, err := t.CopyIn(offsetAddr, &offset); err != nil { return 0, nil, err } if offset < 0 { return 0, nil, syserror.EINVAL } // Send data using Preadv. r := io.NewSectionReader(&fs.FileReader{t, inFile}, offset, count) n, err = io.Copy(w, r) // Copy out the new offset. if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { return 0, nil, err } // If we don't have a provided offset. } else { // Send data using readv. inOff := inFile.Offset() r := &io.LimitedReader{R: &fs.FileReader{t, inFile}, N: count} n, err = io.Copy(w, r) inOff += n if inFile.Offset() != inOff { // Adjust file position in case more bytes were read than written. if _, err := inFile.Seek(t, fs.SeekSet, inOff); err != nil { return 0, nil, syserror.EIO } } } // We can only pass a single file to handleIOError, so pick inFile // arbitrarily. return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) } const ( memfdPrefix = "/memfd:" memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1 ) // MemfdCreate implements the linux syscall memfd_create(2). func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := args[1].Uint() if flags&^memfdAllFlags != 0 { // Unknown bits in flags. return 0, nil, syserror.EINVAL } allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 cloExec := flags&linux.MFD_CLOEXEC != 0 name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix)) if err != nil { return 0, nil, err } if len(name) > memfdMaxNameLen { return 0, nil, syserror.EINVAL } name = memfdPrefix + name inode := tmpfs.NewMemfdInode(t, allowSeals) dirent := fs.NewDirent(inode, name) // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with // FMODE_READ | FMODE_WRITE. file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true}) if err != nil { return 0, nil, err } defer dirent.DecRef() defer file.DecRef() fdFlags := kernel.FDFlags{CloseOnExec: cloExec} newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) if err != nil { return 0, nil, err } return uintptr(newFD), nil, nil }